/* * Ebizzy - replicate a large ebusiness type of workload. * * Written by Valerie Henson * * Copyright 2006 - 2007 Intel Corporation * Copyright 2007 Valerie Henson * * Rodrigo Rubira Branco - HP/BSD/Solaris port and some * new features * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA * */ /* * This program is designed to replicate a common web search app * workload. A lot of search applications have the basic pattern: Get * a request to find a certain record, index into the chunk of memory * that contains it, copy it into another chunk, then look it up via * binary search. The interesting parts of this workload are: * * Large working set * Data alloc/copy/free cycle * Unpredictable data access patterns * * Fiddle with the command line options until you get something * resembling the kind of workload you want to investigate. * */ #include #include #include #include #include #include #include #include #include #include #include "ebizzy.h" /* * Command line options */ static unsigned int always_mmap; static unsigned int never_mmap; static unsigned int chunks; static unsigned int use_permissions; static unsigned int use_holes; static unsigned int random_size; static unsigned int chunk_size; static unsigned int seconds; static unsigned int threads; static unsigned int verbose; static unsigned int linear; static unsigned int touch_pages; static unsigned int no_lib_memcpy; /* * Other global variables */ typedef size_t record_t; static unsigned int record_size = sizeof(record_t); static char *cmd; static record_t **mem; static char **hole_mem; static unsigned int page_size; static time_t start_time; static volatile int threads_go; static void usage(void) { fprintf(stderr, "Usage: %s [options]\n" "-T\t\t Just 'touch' the allocated pages\n" "-l\t\t Don't use library memcpy\n" "-m\t\t Always use mmap instead of malloc\n" "-M\t\t Never use mmap\n" "-n \t Number of memory chunks to allocate\n" "-p \t\t Prevent mmap coalescing using permissions\n" "-P \t\t Prevent mmap coalescing using holes\n" "-R\t\t Randomize size of memory to copy and search\n" "-s \t Size of memory chunks, in bytes\n" "-S \t Number of seconds to run\n" "-t \t Number of threads (2 * number cpus by default)\n" "-v[v[v]]\t Be verbose (more v's for more verbose)\n" "-z\t\t Linear search instead of binary search\n", cmd); exit(1); } /* * Read options, check them, and set some defaults. */ static void read_options(int argc, char *argv[]) { int c; page_size = getpagesize(); /* * Set some defaults. These are currently tuned to run in a * reasonable amount of time on my laptop. * * We could set the static defaults in the declarations, but * then the defaults would be split between here and the top * of the file, which is annoying. */ threads = 2 * sysconf(_SC_NPROCESSORS_ONLN); chunks = 10; chunk_size = record_size * 64 * 1024; seconds = 10; /* On to option processing */ cmd = argv[0]; opterr = 1; while ((c = getopt(argc, argv, "lmMn:pPRs:S:t:vzT")) != -1) { switch (c) { case 'l': no_lib_memcpy = 1; break; case 'm': always_mmap = 1; break; case 'M': never_mmap = 1; break; case 'n': chunks = atoi(optarg); if (chunks == 0) usage(); break; case 'p': use_permissions = 1; break; case 'P': use_holes = 1; break; case 'R': random_size = 1; break; case 's': chunk_size = atoi(optarg); if (chunk_size == 0) usage(); break; case 'S': seconds = atoi(optarg); if (seconds == 0) usage(); break; case 't': threads = atoi(optarg); if (threads == 0) usage(); break; case 'T': touch_pages = 1; break; case 'v': verbose++; break; case 'z': linear = 1; break; default: usage(); } } if (verbose) printf("ebizzy 0.2\n" "(C) 2006-7 Intel Corporation\n" "(C) 2007 Valerie Henson \n"); if (verbose) { printf("always_mmap %u\n", always_mmap); printf("never_mmap %u\n", never_mmap); printf("chunks %u\n", chunks); printf("prevent coalescing using permissions %u\n", use_permissions); printf("prevent coalescing using holes %u\n", use_holes); printf("random_size %u\n", random_size); printf("chunk_size %u\n", chunk_size); printf("seconds %d\n", seconds); printf("threads %u\n", threads); printf("verbose %u\n", verbose); printf("linear %u\n", linear); printf("touch_pages %u\n", touch_pages); printf("page size %d\n", page_size); } /* Check for incompatible options */ if (always_mmap && never_mmap) { fprintf(stderr, "Both -m \"always mmap\" and -M " "\"never mmap\" option specified\n"); usage(); } #ifdef __GLIBC__ if (never_mmap) mallopt(M_MMAP_MAX, 0); #endif if (chunk_size < record_size) { fprintf(stderr, "Chunk size %u smaller than record size %u\n", chunk_size, record_size); usage(); } } static void touch_mem(char *dest, size_t size) { int i; if (touch_pages) { for (i = 0; i < (long)size; i += page_size) *(dest + i) = 0xff; } } static void *alloc_mem(size_t size) { char *p; int err = 0; if (always_mmap) { p = mmap(NULL, size, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANONYMOUS), -1, 0); if (p == MAP_FAILED) err = 1; } else { p = malloc(size); if (p == NULL) err = 1; } if (err) { fprintf(stderr, "Couldn't allocate %zu bytes, try smaller " "chunks or size options\n" "Using -n %u chunks and -s %u size\n", size, chunks, chunk_size); exit(1); } return (p); } static void free_mem(void *p, size_t size) { if (always_mmap) munmap(p, size); else free(p); } /* * Factor out differences in memcpy implementation by optionally using * our own simple memcpy implementation. */ static void my_memcpy(void *dest, void *src, size_t len) { char *d = (char *)dest; char *s = (char *)src; int i; for (i = 0; i < (long)len; i++) d[i] = s[i]; return; } static void allocate(void) { unsigned int i; mem = alloc_mem(chunks * sizeof(record_t *)); if (use_holes) hole_mem = alloc_mem(chunks * sizeof(record_t *)); for (i = 0; i < chunks; i++) { mem[i] = (record_t *) alloc_mem(chunk_size); /* Prevent coalescing using holes */ if (use_holes) hole_mem[i] = alloc_mem(page_size); } /* Free hole memory */ if (use_holes) for (i = 0; i < chunks; i++) free_mem(hole_mem[i], page_size); if (verbose) printf("Allocated memory\n"); } static void write_pattern(void) { unsigned int i, j; for (i = 0; i < chunks; i++) { for (j = 0; j < chunk_size / record_size; j++) mem[i][j] = (record_t) j; /* Prevent coalescing by alternating permissions */ if (use_permissions && (i % 2) == 0) mprotect((void *)mem[i], chunk_size, PROT_READ); } if (verbose) printf("Wrote memory\n"); } static void *linear_search(record_t key, record_t * base, size_t size) { record_t *p; record_t *end = base + (size / record_size); for (p = base; p < end; p++) if (*p == key) return p; return NULL; } static int compare(const void *p1, const void *p2) { return (*(record_t *) p1 - *(record_t *) p2); } /* * Stupid ranged random number function. We don't care about quality. * * Inline because it's starting to be a scaling issue. */ static inline unsigned int rand_num(unsigned int max, unsigned int *state) { *state *= 1103515245 + 12345; return ((*state / 65536) % max); } /* * This function is the meat of the program; the rest is just support. * * In this function, we randomly select a memory chunk, copy it into a * newly allocated buffer, randomly select a search key, look it up, * then free the memory. An option tells us to allocate and copy a * randomly sized chunk of the memory instead of the whole thing. * * Linear search provided for sanity checking. * */ static uintptr_t search_mem(void) { record_t key, *found; record_t *src, *copy; unsigned int chunk; size_t copy_size = chunk_size; uintptr_t i; unsigned int state = 0; for (i = 0; threads_go == 1; i++) { chunk = rand_num(chunks, &state); src = mem[chunk]; /* * If we're doing random sizes, we need a non-zero * multiple of record size. */ if (random_size) copy_size = (rand_num(chunk_size / record_size, &state) + 1) * record_size; copy = alloc_mem(copy_size); if (touch_pages) { touch_mem((char *)copy, copy_size); } else { if (no_lib_memcpy) my_memcpy(copy, src, copy_size); else memcpy(copy, src, copy_size); key = rand_num(copy_size / record_size, &state); if (verbose > 2) printf("Search key %zu, copy size %zu\n", key, copy_size); if (linear) found = linear_search(key, copy, copy_size); else found = bsearch(&key, copy, copy_size / record_size, record_size, compare); /* Below check is mainly for memory corruption or other bug */ if (found == NULL) { fprintf(stderr, "Couldn't find key %zd\n", key); exit(1); } } /* end if ! touch_pages */ free_mem(copy, copy_size); } return (i); } static void *thread_run(void *arg __attribute__((unused))) { uintptr_t records_thread; if (verbose > 1) printf("Thread started\n"); /* Wait for the start signal */ while (threads_go == 0) ; records_thread = search_mem(); if (verbose > 1) printf("Thread finished, %f seconds\n", difftime(time(NULL), start_time)); return (void *)records_thread; } static struct timeval difftimeval(struct timeval *end, struct timeval *start) { struct timeval diff; diff.tv_sec = end->tv_sec - start->tv_sec; diff.tv_usec = end->tv_usec - start->tv_usec; return diff; } static void start_threads(void) { pthread_t thread_array[threads]; double elapsed; unsigned int i; struct rusage start_ru, end_ru; struct timeval usr_time, sys_time; double records_per_sec = 0.0; int err; if (verbose) printf("Threads starting\n"); for (i = 0; i < threads; i++) { err = pthread_create(&thread_array[i], NULL, thread_run, NULL); if (err) { fprintf(stderr, "Error creating thread %d\n", i); exit(1); } } /* * Begin accounting - this is when we actually do the things * we want to measure. */ getrusage(RUSAGE_SELF, &start_ru); start_time = time(NULL); threads_go = 1; sleep(seconds); threads_go = 0; elapsed = difftime(time(NULL), start_time); getrusage(RUSAGE_SELF, &end_ru); /* * The rest is just clean up. */ for (i = 0; i < threads; i++) { uintptr_t record_thread; err = pthread_join(thread_array[i], (void *)&record_thread); if (err) { fprintf(stderr, "Error joining thread %d\n", i); exit(1); } records_per_sec += ((double)record_thread / elapsed); } if (verbose) printf("Threads finished\n"); printf("%tu records/s\n", (uintptr_t) records_per_sec); usr_time = difftimeval(&end_ru.ru_utime, &start_ru.ru_utime); sys_time = difftimeval(&end_ru.ru_stime, &start_ru.ru_stime); printf("real %5.2f s\n", elapsed); printf("user %5.2f s\n", usr_time.tv_sec + usr_time.tv_usec / 1e6); printf("sys %5.2f s\n", sys_time.tv_sec + sys_time.tv_usec / 1e6); } int main(int argc, char *argv[]) { read_options(argc, argv); allocate(); write_pattern(); start_threads(); return 0; }