1Index: linux/arch/i386/Kconfig.debug 2=================================================================== 3--- linux.orig/arch/i386/Kconfig.debug 2005-11-18 14:59:18.000000000 -0800 4+++ linux/arch/i386/Kconfig.debug 2005-11-18 14:59:31.000000000 -0800 5@@ -62,6 +62,13 @@ 6 on the VM subsystem for higher order allocations. This option 7 will also use IRQ stacks to compensate for the reduced stackspace. 8 9+config LOCKMETER 10+ bool "Kernel lock metering" 11+ depends on SMP 12+ help 13+ Say Y to enable kernel lock metering, which adds overhead to SMP locks, 14+ but allows you to see various statistics using the lockstat command. 15+ 16 config X86_FIND_SMP_CONFIG 17 bool 18 depends on X86_LOCAL_APIC || X86_VOYAGER 19Index: linux/arch/ia64/Kconfig.debug 20=================================================================== 21--- linux.orig/arch/ia64/Kconfig.debug 2005-11-18 14:59:18.000000000 -0800 22+++ linux/arch/ia64/Kconfig.debug 2005-11-18 14:59:31.000000000 -0800 23@@ -72,4 +72,11 @@ 24 depends on COMPAT && SYSVIPC 25 default y 26 27+config LOCKMETER 28+ bool "Kernel lock metering" 29+ depends on SMP 30+ help 31+ Say Y to enable kernel lock metering, which adds overhead to SMP locks, 32+ but allows you to see various statistics using the lockstat command. 33+ 34 endmenu 35Index: linux/fs/proc/proc_misc.c 36=================================================================== 37--- linux.orig/fs/proc/proc_misc.c 2005-11-18 14:59:20.000000000 -0800 38+++ linux/fs/proc/proc_misc.c 2005-11-18 14:59:31.000000000 -0800 39@@ -563,6 +563,36 @@ 40 entry->proc_fops = f; 41 } 42 43+#ifdef CONFIG_LOCKMETER 44+extern ssize_t get_lockmeter_info(char *, size_t, loff_t *); 45+extern ssize_t put_lockmeter_info(const char *, size_t); 46+extern int get_lockmeter_info_size(void); 47+ 48+/* 49+ * This function accesses lock metering information. 
50+ */ 51+static ssize_t read_lockmeter(struct file *file, char *buf, 52+ size_t count, loff_t *ppos) 53+{ 54+ return get_lockmeter_info(buf, count, ppos); 55+} 56+ 57+/* 58+ * Writing to /proc/lockmeter resets the counters 59+ */ 60+static ssize_t write_lockmeter(struct file * file, const char * buf, 61+ size_t count, loff_t *ppos) 62+{ 63+ return put_lockmeter_info(buf, count); 64+} 65+ 66+static struct file_operations proc_lockmeter_operations = { 67+ NULL, /* lseek */ 68+ read: read_lockmeter, 69+ write: write_lockmeter, 70+}; 71+#endif /* CONFIG_LOCKMETER */ 72+ 73 void __init proc_misc_init(void) 74 { 75 struct proc_dir_entry *entry; 76@@ -629,6 +659,13 @@ 77 if (entry) 78 entry->proc_fops = &proc_sysrq_trigger_operations; 79 #endif 80+#ifdef CONFIG_LOCKMETER 81+ entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL); 82+ if (entry) { 83+ entry->proc_fops = &proc_lockmeter_operations; 84+ entry->size = get_lockmeter_info_size(); 85+ } 86+#endif 87 #ifdef CONFIG_PPC32 88 { 89 extern struct file_operations ppc_htab_operations; 90Index: linux/include/asm-alpha/lockmeter.h 91=================================================================== 92--- /dev/null 1970-01-01 00:00:00.000000000 +0000 93+++ linux/include/asm-alpha/lockmeter.h 2005-11-18 14:59:31.000000000 -0800 94@@ -0,0 +1,42 @@ 95+/* 96+ * Written by John Hawkes (hawkes@sgi.com) 97+ * Based on klstat.h by Jack Steiner (steiner@sgi.com) 98+ * 99+ * Modified by Peter Rival (frival@zk3.dec.com) 100+ */ 101+ 102+#ifndef _ALPHA_LOCKMETER_H 103+#define _ALPHA_LOCKMETER_H 104+ 105+#include <asm/hwrpb.h> 106+#define CPU_CYCLE_FREQUENCY hwrpb->cycle_freq 107+ 108+#define get_cycles64() get_cycles() 109+ 110+#define THIS_CPU_NUMBER smp_processor_id() 111+ 112+#include <linux/version.h> 113+ 114+#define SPINLOCK_MAGIC_INIT /**/ 115+ 116+/* 117+ * return true if rwlock is write locked 118+ * (note that other lock attempts can cause the lock value to be negative) 119+ */ 120+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1) 121+#define IABS(x) ((x) > 0 ? (x) : -(x)) 122+ 123+#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) 124+extern inline int rwlock_readers(rwlock_t *rwlock_ptr) 125+{ 126+ int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->raw_lock.lock; 127+ /* readers subtract 2, so we have to: */ 128+ /* - andnot off a possible writer (bit 0) */ 129+ /* - get the absolute value */ 130+ /* - divide by 2 (right shift by one) */ 131+ /* to find the number of readers */ 132+ if (tmp == 0) return(0); 133+ else return(IABS(tmp & ~1)>>1); 134+} 135+ 136+#endif /* _ALPHA_LOCKMETER_H */ 137Index: linux/include/asm-i386/lockmeter.h 138=================================================================== 139--- /dev/null 1970-01-01 00:00:00.000000000 +0000 140+++ linux/include/asm-i386/lockmeter.h 2005-11-18 14:59:31.000000000 -0800 141@@ -0,0 +1,77 @@ 142+/* 143+ * Copyright (C) 1999,2000 Silicon Graphics, Inc. 144+ * 145+ * Written by John Hawkes (hawkes@sgi.com) 146+ * Based on klstat.h by Jack Steiner (steiner@sgi.com) 147+ * 148+ * Modified by Ray Bryant (raybry@us.ibm.com) 149+ * Changes Copyright (C) 2000 IBM, Inc. 150+ * Added save of index in spinlock_t to improve efficiency 151+ * of "hold" time reporting for spinlocks. 152+ * Added support for hold time statistics for read and write 153+ * locks. 154+ * Moved machine dependent code here from include/lockmeter.h. 
155+ * 156+ */ 157+ 158+#ifndef _I386_LOCKMETER_H 159+#define _I386_LOCKMETER_H 160+ 161+#include <asm/spinlock.h> 162+#include <asm/rwlock.h> 163+ 164+#include <linux/version.h> 165+ 166+#ifdef __KERNEL__ 167+extern unsigned int cpu_khz; 168+#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) 169+#else 170+#define CPU_CYCLE_FREQUENCY 450000000 171+#endif 172+ 173+#define THIS_CPU_NUMBER smp_processor_id() 174+ 175+/* 176+ * return the number of readers for a rwlock_t 177+ */ 178+#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) 179+ 180+extern inline int rwlock_readers(rwlock_t *rwlock_ptr) 181+{ 182+ int tmp = (int) rwlock_ptr->raw_lock.lock; 183+ /* read and write lock attempts may cause the lock value to temporarily */ 184+ /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ 185+ /* is -1 because it was write locked and somebody tried to read lock it */ 186+ /* or if it is -1 because it was read locked and somebody tried to write*/ 187+ /* lock it. ........................................................... */ 188+ do { 189+ tmp = (int) rwlock_ptr->raw_lock.lock; 190+ } while (tmp < 0); 191+ if (tmp == 0) return(0); 192+ else return(RW_LOCK_BIAS-tmp); 193+} 194+ 195+/* 196+ * return true if rwlock is write locked 197+ * (note that other lock attempts can cause the lock value to be negative) 198+ */ 199+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.lock <= 0) 200+#define IABS(x) ((x) > 0 ? (x) : -(x)) 201+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->raw_lock.lock) % RW_LOCK_BIAS) != 0) 202+ 203+/* this is a lot of typing just to get gcc to emit "rdtsc" */ 204+static inline long long get_cycles64 (void) 205+{ 206+ union longlong_u { 207+ long long intlong; 208+ struct intint_s { 209+ uint32_t eax; 210+ uint32_t edx; 211+ } intint; 212+ } longlong; 213+ 214+ rdtsc(longlong.intint.eax,longlong.intint.edx); 215+ return longlong.intlong; 216+} 217+ 218+#endif /* _I386_LOCKMETER_H */ 219Index: linux/include/asm-ia64/lockmeter.h 220=================================================================== 221--- /dev/null 1970-01-01 00:00:00.000000000 +0000 222+++ linux/include/asm-ia64/lockmeter.h 2005-11-18 14:59:31.000000000 -0800 223@@ -0,0 +1,33 @@ 224+/* 225+ * Copyright (C) 1999,2000 Silicon Graphics, Inc. 
226+ * 227+ * Written by John Hawkes (hawkes@sgi.com) 228+ * Based on klstat.h by Jack Steiner (steiner@sgi.com) 229+ */ 230+ 231+#ifndef _IA64_LOCKMETER_H 232+#define _IA64_LOCKMETER_H 233+ 234+#ifdef local_cpu_data 235+#define CPU_CYCLE_FREQUENCY local_cpu_data->itc_freq 236+#else 237+#define CPU_CYCLE_FREQUENCY my_cpu_data.itc_freq 238+#endif 239+#define get_cycles64() get_cycles() 240+ 241+#define THIS_CPU_NUMBER smp_processor_id() 242+ 243+/* 244+ * return the number of readers for a rwlock_t 245+ */ 246+#define RWLOCK_READERS(rwlock_ptr) ((rwlock_ptr)->raw_lock.read_counter) 247+ 248+/* 249+ * return true if rwlock is write locked 250+ * (note that other lock attempts can cause the lock value to be negative) 251+ */ 252+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.write_lock) 253+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.read_counter) 254+ 255+#endif /* _IA64_LOCKMETER_H */ 256+ 257Index: linux/include/asm-mips/lockmeter.h 258=================================================================== 259--- /dev/null 1970-01-01 00:00:00.000000000 +0000 260+++ linux/include/asm-mips/lockmeter.h 2005-11-18 14:59:31.000000000 -0800 261@@ -0,0 +1,91 @@ 262+/* 263+ * Copyright (C) 1999,2000 Silicon Graphics, Inc. 264+ * 265+ * Written by John Hawkes (hawkes@sgi.com) 266+ * Based on klstat.h by Jack Steiner (steiner@sgi.com) 267+ * Ported to mips32 for Asita Technologies 268+ * by D.J. Barrow ( dj.barrow@asitatechnologies.com ) 269+ */ 270+#ifndef _ASM_LOCKMETER_H 271+#define _ASM_LOCKMETER_H 272+ 273+/* do_gettimeoffset is a function pointer on mips */ 274+/* & it is not included by <linux/time.h> */ 275+#include <asm/time.h> 276+#include <linux/time.h> 277+#include <asm/div64.h> 278+ 279+#define SPINLOCK_MAGIC_INIT /* */ 280+ 281+#define CPU_CYCLE_FREQUENCY get_cpu_cycle_frequency() 282+ 283+#define THIS_CPU_NUMBER smp_processor_id() 284+ 285+static uint32_t cpu_cycle_frequency = 0; 286+ 287+static uint32_t get_cpu_cycle_frequency(void) 288+{ 289+ /* a total hack, slow and invasive, but ... it works */ 290+ int sec; 291+ uint32_t start_cycles; 292+ struct timeval tv; 293+ 294+ if (cpu_cycle_frequency == 0) { /* uninitialized */ 295+ do_gettimeofday(&tv); 296+ sec = tv.tv_sec; /* set up to catch the tv_sec rollover */ 297+ while (sec == tv.tv_sec) { do_gettimeofday(&tv); } 298+ sec = tv.tv_sec; /* rolled over to a new sec value */ 299+ start_cycles = get_cycles(); 300+ while (sec == tv.tv_sec) { do_gettimeofday(&tv); } 301+ cpu_cycle_frequency = get_cycles() - start_cycles; 302+ } 303+ 304+ return cpu_cycle_frequency; 305+} 306+ 307+extern struct timeval xtime; 308+ 309+static uint64_t get_cycles64(void) 310+{ 311+ static uint64_t last_get_cycles64 = 0; 312+ uint64_t ret; 313+ unsigned long sec; 314+ unsigned long usec, usec_offset; 315+ 316+again: 317+ sec = xtime.tv_sec; 318+ usec = xtime.tv_usec; 319+ usec_offset = do_gettimeoffset(); 320+ if ((xtime.tv_sec != sec) || 321+ (xtime.tv_usec != usec)|| 322+ (usec_offset >= 20000)) 323+ goto again; 324+ 325+ ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency); 326+ /* We can't do a normal 64 bit division on mips without libgcc.a */ 327+ do_div(ret,1000000); 328+ ret += ((uint64_t)sec * cpu_cycle_frequency); 329+ 330+ /* XXX why does time go backwards? do_gettimeoffset? general time adj? 
*/
+        if (ret <= last_get_cycles64)
+                ret = last_get_cycles64+1;
+        last_get_cycles64 = ret;
+
+        return ret;
+}
+
+/*
+ * return the number of readers for a rwlock_t
+ */
+#define RWLOCK_READERS(rwlock_ptr)      rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+        int tmp = (int) rwlock_ptr->raw_lock.lock;
+        return (tmp >= 0) ? tmp : 0;
+}
+
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.lock < 0)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((rwlock_ptr)->raw_lock.lock > 0)
+
+#endif /* _ASM_LOCKMETER_H */
Index: linux/include/asm-sparc64/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-sparc64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com)
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#ifndef _SPARC64_LOCKMETER_H
+#define _SPARC64_LOCKMETER_H
+
+#include <linux/smp.h>
+#include <asm/spinlock.h>
+#include <asm/timer.h>
+#include <asm/timex.h>
+
+/* Actually, this is not the CPU frequency but the system tick
+ * frequency, which is good enough for lock metering.
+ */
+#define CPU_CYCLE_FREQUENCY     (timer_tick_offset * HZ)
+#define THIS_CPU_NUMBER         smp_processor_id()
+
+#define RWLOCK_READERS(rwlock_ptr)      rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+        signed int tmp = rwlock_ptr->raw_lock.lock;
+
+        if (tmp > 0)
+                return tmp;
+        else
+                return 0;
+}
+
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)      ((signed int)((rwlock_ptr)->raw_lock.lock) < 0)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)       ((signed int)((rwlock_ptr)->raw_lock.lock) > 0)
+
+#define get_cycles64()  get_cycles()
+
+#endif /* _SPARC64_LOCKMETER_H */
Index: linux/include/asm-x86_64/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-x86_64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 1999,2000 Silicon Graphics, Inc.
+ *
+ * Written by John Hawkes (hawkes@sgi.com)
+ * Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *
+ * Modified by Ray Bryant (raybry@us.ibm.com)
+ * Changes Copyright (C) 2000 IBM, Inc.
+ * Added save of index in spinlock_t to improve efficiency
+ * of "hold" time reporting for spinlocks.
+ * Added support for hold time statistics for read and write
+ * locks.
+ * Moved machine dependent code here from include/lockmeter.h.
413+ * 414+ */ 415+ 416+#ifndef _X8664_LOCKMETER_H 417+#define _X8664_LOCKMETER_H 418+ 419+#include <asm/spinlock.h> 420+#include <asm/rwlock.h> 421+ 422+#include <linux/version.h> 423+ 424+#ifdef __KERNEL__ 425+extern unsigned int cpu_khz; 426+#define CPU_CYCLE_FREQUENCY (cpu_khz * 1000) 427+#else 428+#define CPU_CYCLE_FREQUENCY 450000000 429+#endif 430+ 431+#define THIS_CPU_NUMBER smp_processor_id() 432+ 433+/* 434+ * return the number of readers for a rwlock_t 435+ */ 436+#define RWLOCK_READERS(rwlock_ptr) rwlock_readers(rwlock_ptr) 437+ 438+extern inline int rwlock_readers(rwlock_t *rwlock_ptr) 439+{ 440+ int tmp = (int) rwlock_ptr->raw_lock.lock; 441+ /* read and write lock attempts may cause the lock value to temporarily */ 442+ /* be negative. Until it is >= 0 we know nothing (i. e. can't tell if */ 443+ /* is -1 because it was write locked and somebody tried to read lock it */ 444+ /* or if it is -1 because it was read locked and somebody tried to write*/ 445+ /* lock it. ........................................................... */ 446+ do { 447+ tmp = (int) rwlock_ptr->raw_lock.lock; 448+ } while (tmp < 0); 449+ if (tmp == 0) return(0); 450+ else return(RW_LOCK_BIAS-tmp); 451+} 452+ 453+/* 454+ * return true if rwlock is write locked 455+ * (note that other lock attempts can cause the lock value to be negative) 456+ */ 457+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.lock <= 0) 458+#define IABS(x) ((x) > 0 ? (x) : -(x)) 459+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr) ((IABS((rwlock_ptr)->raw_lock.lock) % RW_LOCK_BIAS) != 0) 460+ 461+#define get_cycles64() get_cycles() 462+ 463+#endif /* _X8664_LOCKMETER_H */ 464Index: linux/include/linux/lockmeter.h 465=================================================================== 466--- /dev/null 1970-01-01 00:00:00.000000000 +0000 467+++ linux/include/linux/lockmeter.h 2005-11-18 14:59:31.000000000 -0800 468@@ -0,0 +1,342 @@ 469+/* 470+ * Copyright (C) 1999-2002 Silicon Graphics, Inc. 471+ * 472+ * Written by John Hawkes (hawkes@sgi.com) 473+ * Based on klstat.h by Jack Steiner (steiner@sgi.com) 474+ * 475+ * Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000 476+ * Changes Copyright (C) 2000 IBM, Inc. 477+ * Added save of index in spinlock_t to improve efficiency 478+ * of "hold" time reporting for spinlocks 479+ * Added support for hold time statistics for read and write 480+ * locks. 481+ * Moved machine dependent code to include/asm/lockmeter.h. 482+ * 483+ */ 484+ 485+#ifndef _LINUX_LOCKMETER_H 486+#define _LINUX_LOCKMETER_H 487+ 488+#include <linux/utsname.h> 489+ 490+#ifdef CONFIG_LOCKMETER 491+ 492+/*--------------------------------------------------- 493+ * architecture-independent lockmeter.h 494+ *-------------------------------------------------*/ 495+ 496+/* 497+ * raybry -- version 2: added efficient hold time statistics 498+ * requires lstat recompile, so flagged as new version 499+ * raybry -- version 3: added global reader lock data 500+ * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port 501+ */ 502+#define LSTAT_VERSION 5 503+ 504+int lstat_update(void*, void*, int); 505+int lstat_update_time(void*, void*, int, uint32_t); 506+ 507+/* 508+ * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we 509+ * need to force compatibility in the inter-communication data structure. 
+ */
+
+#if defined(CONFIG_MIPS32_COMPAT)
+#define TIME_T          uint32_t
+#elif defined(CONFIG_SPARC) || defined(CONFIG_SPARC64)
+#define TIME_T          uint64_t
+#else
+#define TIME_T          time_t
+#endif
+
+#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC) && !defined(CONFIG_SPARC64)) || (_MIPS_SZLONG==32)
+#define POINTER         void *
+#else
+#define POINTER         int64_t
+#endif
+
+/*
+ * Values for the "action" parameter passed to lstat_update.
+ * ZZZ - do we want a try-success status here???
+ */
+#define LSTAT_ACT_NO_WAIT       0
+#define LSTAT_ACT_SPIN          1
+#define LSTAT_ACT_REJECT        2
+#define LSTAT_ACT_WW_SPIN       3
+#define LSTAT_ACT_SLEPT         4 /* UNUSED */
+
+#define LSTAT_ACT_MAX_VALUES    4 /* NOTE: Increase to 5 if ACT_SLEPT is used */
+
+/*
+ * Special values for the low 2 bits of an RA passed to
+ * lstat_update.
+ */
+/* we use these values to figure out what kind of lock data */
+/* is stored in the statistics table entry at index ....... */
+#define LSTAT_RA_SPIN           0 /* spin lock data */
+#define LSTAT_RA_READ           1 /* read lock statistics */
+#define LSTAT_RA_SEMA           2 /* RESERVED */
+#define LSTAT_RA_WRITE          3 /* write lock statistics*/
+
+#define LSTAT_RA(n)     \
+        ((void*)( ((unsigned long) caller_pc & ~3) | n) )
+
+/*
+ * Constants used for lock addresses in the lstat_directory
+ * to indicate special values of the lock address.
+ */
+#define LSTAT_MULTI_LOCK_ADDRESS        NULL
+
+/*
+ * Maximum size of the lockstats tables.  Increase this value
+ * if it's not big enough.  (Nothing bad happens if it's not
+ * big enough, although some locks will not be monitored.)
+ * We record overflows of this quantity in lstat_control.dir_overflow.
+ *
+ * Note:  The max value here must fit into the field set
+ * and obtained by the macros PUT_INDEX() and GET_INDEX().
+ * This value depends on how many bits are available in the
+ * lock word in the particular machine implementation we are on.
+ */
+#define LSTAT_MAX_STAT_INDEX            2000
+
+/*
+ * Size and mask for the hash table into the directory.
+ */
+#define LSTAT_HASH_TABLE_SIZE           4096    /* must be 2**N */
+#define LSTAT_HASH_TABLE_MASK           (LSTAT_HASH_TABLE_SIZE-1)
+
+#define DIRHASH(ra)     ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK)
+
+/*
+ * This defines an entry in the lockstat directory.  It contains
+ * information about a lock being monitored.
+ * A directory entry only contains the lock identification -
+ * counts on usage of the lock are kept elsewhere in a per-cpu
+ * data structure to minimize cache line pinging.
+ */
+typedef struct {
+        POINTER caller_ra;      /* RA of code that set lock */
+        POINTER lock_ptr;       /* lock address */
+        ushort next_stat_index; /* Used to link multiple locks that have the same hash table value */
+} lstat_directory_entry_t;
+
+/*
+ * A multi-dimensioned array used to contain counts for lock accesses.
+ * The array is 3-dimensional:
+ *    - CPU number.  Keep from thrashing cache lines between CPUs
+ *    - Directory entry index.  Identifies the lock
+ *    - Action.  Indicates what kind of contention occurred on an
+ *      access to the lock.
+ *
+ * The index of an entry in the directory is the same as the 2nd index
+ * of the entry in the counts array.
+ */
+/*
+ * This table contains data for spin_locks, write locks, and read locks
+ * Not all data is used for all cases.
In particular, the hold time 606+ * information is not stored here for read locks since that is a global 607+ * (e. g. cannot be separated out by return address) quantity. 608+ * See the lstat_read_lock_counts_t structure for the global read lock 609+ * hold time. 610+ */ 611+typedef struct { 612+ uint64_t cum_wait_ticks; /* sum of wait times */ 613+ /* for write locks, sum of time a */ 614+ /* writer is waiting for a reader */ 615+ int64_t cum_hold_ticks; /* cumulative sum of holds */ 616+ /* not used for read mode locks */ 617+ /* must be signed. ............... */ 618+ uint32_t max_wait_ticks; /* max waiting time */ 619+ uint32_t max_hold_ticks; /* max holding time */ 620+ uint64_t cum_wait_ww_ticks; /* sum times writer waits on writer*/ 621+ uint32_t max_wait_ww_ticks; /* max wait time writer vs writer */ 622+ /* prev 2 only used for write locks*/ 623+ uint32_t acquire_time; /* time lock acquired this CPU */ 624+ uint32_t count[LSTAT_ACT_MAX_VALUES]; 625+} lstat_lock_counts_t; 626+ 627+typedef lstat_lock_counts_t lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX]; 628+ 629+/* 630+ * User request to: 631+ * - turn statistic collection on/off, or to reset 632+ */ 633+#define LSTAT_OFF 0 634+#define LSTAT_ON 1 635+#define LSTAT_RESET 2 636+#define LSTAT_RELEASE 3 637+ 638+#define LSTAT_MAX_READ_LOCK_INDEX 1000 639+typedef struct { 640+ POINTER lock_ptr; /* address of lock for output stats */ 641+ uint32_t read_lock_count; 642+ int64_t cum_hold_ticks; /* sum of read lock hold times over */ 643+ /* all callers. ....................*/ 644+ uint32_t write_index; /* last write lock hash table index */ 645+ uint32_t busy_periods; /* count of busy periods ended this */ 646+ uint64_t start_busy; /* time this busy period started. ..*/ 647+ uint64_t busy_ticks; /* sum of busy periods this lock. ..*/ 648+ uint64_t max_busy; /* longest busy period for this lock*/ 649+ uint32_t max_readers; /* maximum number of readers ...... */ 650+#ifdef USER_MODE_TESTING 651+ rwlock_t entry_lock; /* lock for this read lock entry... */ 652+ /* avoid having more than one rdr at*/ 653+ /* needed for user space testing... */ 654+ /* not needed for kernel 'cause it */ 655+ /* is non-preemptive. ............. */ 656+#endif 657+} lstat_read_lock_counts_t; 658+typedef lstat_read_lock_counts_t lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX]; 659+ 660+#if defined(__KERNEL__) || defined(USER_MODE_TESTING) 661+ 662+/* 663+ * macros to cache and retrieve an index value inside of a lock 664+ * these macros assume that there are less than 65536 simultaneous 665+ * (read mode) holders of a rwlock. 666+ * we also assume that the hash table has less than 32767 entries. 667+ */ 668+#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = indexv 669+#define GET_INDEX(lock_ptr) (lock_ptr)->index 670+ 671+#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = indexv 672+#define GET_RWINDEX(rwlock_ptr) (rwlock_ptr)->index 673+#define PUT_RW_CPU(rwlock_ptr,cpuv) (rwlock_ptr)->cpu = cpuv 674+#define GET_RW_CPU(rwlock_ptr) (rwlock_ptr)->cpu 675+ 676+#ifndef USER_MODE_TESTING 677+#include <asm/lockmeter.h> 678+#else 679+#include "asm_newlockmeter.h" 680+#endif 681+ 682+/* 683+ * Size and mask for the hash table into the directory. 684+ */ 685+#define LSTAT_HASH_TABLE_SIZE 4096 /* must be 2**N */ 686+#define LSTAT_HASH_TABLE_MASK (LSTAT_HASH_TABLE_SIZE-1) 687+ 688+#define DIRHASH(ra) ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK) 689+ 690+/* 691+ * This version eliminates the per processor lock stack. 
What we do is to
+ * store the index of the lock hash structure in unused bits in the lock
+ * itself.  Then on unlock we can find the statistics record without doing
+ * any additional hash or lock stack lookup.  This works for spin_locks.
+ * Hold time reporting is now basically as cheap as wait time reporting
+ * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT
+ * as in version 1.1.* of lockmeter.
+ *
+ * For rw_locks, we store the index of a global reader stats structure in
+ * the lock and the writer index is stored in the latter structure.
+ * For read mode locks we hash at the time of the lock to find an entry
+ * in the directory for reader wait time and the like.
+ * At unlock time for read mode locks, we update just the global structure
+ * so we don't need to know the reader directory index value at unlock time.
+ *
+ */
+
+/*
+ * Protocol to change lstat_control.state
+ * This is complicated because we don't want the cum_hold_time for
+ * a rw_lock to be decremented in _read_lock_ without making sure it
+ * is incremented in _read_unlock_, and vice versa.  So here is the
+ * way we change the state of lstat_control.state:
+ * I.  To Turn Statistics On
+ *     After allocating storage, set lstat_control.state non-zero.
+ *     This works because we don't start updating statistics for in-use
+ *     locks until the reader lock count goes to zero.
+ * II. To Turn Statistics Off:
+ *     (0) Disable interrupts on this CPU
+ *     (1) Seize the lstat_control.directory_lock
+ *     (2) Obtain the current value of lstat_control.next_free_read_lock_index
+ *     (3) Store a zero in lstat_control.state.
+ *     (4) Release the lstat_control.directory_lock
+ *     (5) For each lock in the read lock list up to the saved value
+ *         (well, -1) of the next_free_read_lock_index, do the following:
+ *         (a) Check validity of the stored lock address
+ *             by making sure that the word at the saved addr
+ *             has an index that matches this entry.  If not
+ *             valid, then skip this entry.
+ *         (b) If there is a write lock already set on this lock,
+ *             skip to (d) below.
+ *         (c) Set a non-metered write lock on the lock
+ *         (d) set the cached INDEX in the lock to zero
+ *         (e) Release the non-metered write lock.
+ *     (6) Re-enable interrupts
+ *
+ * These rules ensure that a read lock will not have its statistics
+ * partially updated even though the global lock recording state has
+ * changed.  See put_lockmeter_info() for implementation.
+ *
+ * The reason for (b) is that there may be write locks set on the
+ * syscall path to put_lockmeter_info() from user space.  If we do
+ * not do this check, then we can deadlock.  A similar problem would
+ * occur if the lock was read locked by the current CPU.  At the
+ * moment this does not appear to happen.
+ */
+
+/*
+ * Main control structure for lockstat.  Used to turn statistics on/off
+ * and to maintain directory info.
+ */
+typedef struct {
+        int state;
+        spinlock_t control_lock;        /* used to serialize turning statistics on/off */
+        spinlock_t directory_lock;      /* for serializing additions to the directory  */
+        volatile int next_free_dir_index;/* next free entry in the directory */
+        /* FIXME not all of these fields are used / needed ..............
*/ 758+ /* the following fields represent data since */ 759+ /* first "lstat on" or most recent "lstat reset" */ 760+ TIME_T first_started_time; /* time when measurement first enabled */ 761+ TIME_T started_time; /* time when measurement last started */ 762+ TIME_T ending_time; /* time when measurement last disabled */ 763+ uint64_t started_cycles64; /* cycles when measurement last started */ 764+ uint64_t ending_cycles64; /* cycles when measurement last disabled */ 765+ uint64_t enabled_cycles64; /* total cycles with measurement enabled */ 766+ int intervals; /* number of measurement intervals recorded */ 767+ /* i. e. number of times did lstat on;lstat off */ 768+ lstat_directory_entry_t *dir; /* directory */ 769+ int dir_overflow; /* count of times ran out of space in directory */ 770+ int rwlock_overflow; /* count of times we couldn't allocate a rw block*/ 771+ ushort *hashtab; /* hash table for quick dir scans */ 772+ lstat_cpu_counts_t *counts[NR_CPUS]; /* Array of pointers to per-cpu stats */ 773+ int next_free_read_lock_index; /* next rwlock reader (global) stats block */ 774+ lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats */ 775+} lstat_control_t; 776+ 777+#endif /* defined(__KERNEL__) || defined(USER_MODE_TESTING) */ 778+ 779+typedef struct { 780+ short lstat_version; /* version of the data */ 781+ short state; /* the current state is returned */ 782+ int maxcpus; /* Number of cpus present */ 783+ int next_free_dir_index; /* index of the next free directory entry */ 784+ TIME_T first_started_time; /* when measurement enabled for first time */ 785+ TIME_T started_time; /* time in secs since 1969 when stats last turned on */ 786+ TIME_T ending_time; /* time in secs since 1969 when stats last turned off */ 787+ uint32_t cycleval; /* cycles per second */ 788+#ifdef notyet 789+ void *kernel_magic_addr; /* address of kernel_magic */ 790+ void *kernel_end_addr; /* contents of kernel magic (points to "end") */ 791+#endif 792+ int next_free_read_lock_index; /* index of next (global) read lock stats struct */ 793+ uint64_t started_cycles64; /* cycles when measurement last started */ 794+ uint64_t ending_cycles64; /* cycles when stats last turned off */ 795+ uint64_t enabled_cycles64; /* total cycles with measurement enabled */ 796+ int intervals; /* number of measurement intervals recorded */ 797+ /* i.e. number of times we did lstat on;lstat off*/ 798+ int dir_overflow; /* number of times we wanted more space in directory */ 799+ int rwlock_overflow; /* # of times we wanted more space in read_locks_count */ 800+ struct new_utsname uts; /* info about machine where stats are measured */ 801+ /* -T option of lockstat allows data to be */ 802+ /* moved to another machine. ................. */ 803+} lstat_user_request_t; 804+ 805+#else 806+XXX fix: defines for _metered routines 807+ 808+#endif 809+ 810+#endif /* _LINUX_LOCKMETER_H */ 811Index: linux/kernel/lockmeter.c 812=================================================================== 813--- /dev/null 1970-01-01 00:00:00.000000000 +0000 814+++ linux/kernel/lockmeter.c 2005-11-18 14:59:31.000000000 -0800 815@@ -0,0 +1,1251 @@ 816+/* 817+ * Copyright (C) 1999,2000 Silicon Graphics, Inc. 818+ * 819+ * Written by John Hawkes (hawkes@sgi.com) 820+ * Based on klstat.c by Jack Steiner (steiner@sgi.com) 821+ * 822+ * Modified by Ray Bryant (raybry@us.ibm.com) 823+ * Changes Copyright (C) 2000 IBM, Inc. 
824+ * Added save of index in spinlock_t to improve efficiency 825+ * of "hold" time reporting for spinlocks 826+ * Added support for hold time statistics for read and write 827+ * locks. 828+ * 829+ * Modified by Ray Bryant (raybry@sgi.com) 830+ * Changes Copyright (C) 2004, Silicon Graphics, Inc. 831+ * Fix to work with out-of-line spinlocks. 832+ */ 833+ 834+#include <linux/config.h> 835+#include <linux/linkage.h> 836+#include <linux/preempt.h> 837+#include <linux/interrupt.h> 838+#include <linux/module.h> 839+#include <linux/types.h> 840+#include <linux/errno.h> 841+#include <linux/slab.h> 842+#include <linux/sched.h> 843+#include <linux/smp.h> 844+#include <linux/threads.h> 845+#include <linux/version.h> 846+#include <linux/vmalloc.h> 847+#include <linux/spinlock.h> 848+#include <linux/utsname.h> 849+#include <linux/module.h> 850+#include <asm/system.h> 851+#include <asm/uaccess.h> 852+ 853+#include <linux/lockmeter.h> 854+ 855+#define ASSERT(cond) 856+#define bzero(loc,size) memset(loc,0,size) 857+ 858+/*<---------------------------------------------------*/ 859+/* lockmeter.c */ 860+/*>---------------------------------------------------*/ 861+ 862+static lstat_control_t lstat_control __cacheline_aligned = 863+ { LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED, 864+ 19 * 0, NR_CPUS * 0, 0, NR_CPUS * 0 }; 865+ 866+static ushort lstat_make_dir_entry(void *, void *); 867+ 868+/* 869+ * lstat_lookup 870+ * 871+ * Given a RA, locate the directory entry for the lock. 872+ */ 873+static ushort 874+lstat_lookup(void *lock_ptr, void *caller_ra) 875+{ 876+ ushort index; 877+ lstat_directory_entry_t *dirp; 878+ 879+ dirp = lstat_control.dir; 880+ 881+ index = lstat_control.hashtab[DIRHASH(caller_ra)]; 882+ while (dirp[index].caller_ra != caller_ra) { 883+ if (index == 0) { 884+ return lstat_make_dir_entry(lock_ptr, caller_ra); 885+ } 886+ index = dirp[index].next_stat_index; 887+ } 888+ 889+ if (dirp[index].lock_ptr != NULL && dirp[index].lock_ptr != lock_ptr) { 890+ dirp[index].lock_ptr = NULL; 891+ } 892+ 893+ return index; 894+} 895+ 896+/* 897+ * lstat_make_dir_entry 898+ * Called to add a new lock to the lock directory. 
899+ */ 900+static ushort 901+lstat_make_dir_entry(void *lock_ptr, void *caller_ra) 902+{ 903+ lstat_directory_entry_t *dirp; 904+ ushort index, hindex; 905+ unsigned long flags; 906+ 907+ /* lock the table without recursively reentering this metering code */ 908+ local_irq_save(flags); 909+ _raw_spin_lock(&lstat_control.directory_lock); 910+ 911+ hindex = DIRHASH(caller_ra); 912+ index = lstat_control.hashtab[hindex]; 913+ dirp = lstat_control.dir; 914+ while (index && dirp[index].caller_ra != caller_ra) 915+ index = dirp[index].next_stat_index; 916+ 917+ if (index == 0) { 918+ if (lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) { 919+ index = lstat_control.next_free_dir_index++; 920+ lstat_control.dir[index].caller_ra = caller_ra; 921+ lstat_control.dir[index].lock_ptr = lock_ptr; 922+ lstat_control.dir[index].next_stat_index = 923+ lstat_control.hashtab[hindex]; 924+ lstat_control.hashtab[hindex] = index; 925+ } else { 926+ lstat_control.dir_overflow++; 927+ } 928+ } 929+ _raw_spin_unlock(&lstat_control.directory_lock); 930+ local_irq_restore(flags); 931+ return index; 932+} 933+ 934+int 935+lstat_update(void *lock_ptr, void *caller_ra, int action) 936+{ 937+ int index; 938+ int cpu; 939+ 940+ ASSERT(action < LSTAT_ACT_MAX_VALUES); 941+ 942+ if (lstat_control.state == LSTAT_OFF) 943+ return 0; 944+ 945+ index = lstat_lookup(lock_ptr, caller_ra); 946+ cpu = THIS_CPU_NUMBER; 947+ (*lstat_control.counts[cpu])[index].count[action]++; 948+ (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); 949+ 950+ return index; 951+} 952+ 953+int 954+lstat_update_time(void *lock_ptr, void *caller_ra, int action, uint32_t ticks) 955+{ 956+ ushort index; 957+ int cpu; 958+ 959+ ASSERT(action < LSTAT_ACT_MAX_VALUES); 960+ 961+ if (lstat_control.state == LSTAT_OFF) 962+ return 0; 963+ 964+ index = lstat_lookup(lock_ptr, caller_ra); 965+ cpu = THIS_CPU_NUMBER; 966+ (*lstat_control.counts[cpu])[index].count[action]++; 967+ (*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t) ticks; 968+ if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks) 969+ (*lstat_control.counts[cpu])[index].max_wait_ticks = ticks; 970+ 971+ (*lstat_control.counts[cpu])[index].acquire_time = get_cycles(); 972+ 973+ return index; 974+} 975+ 976+void 977+_metered_spin_lock(spinlock_t * lock_ptr, void *caller_pc) 978+{ 979+ if (lstat_control.state == LSTAT_OFF) { 980+ _raw_spin_lock(lock_ptr); /* do the real lock */ 981+ PUT_INDEX(lock_ptr, 0); /* clean index in case lockmetering */ 982+ /* gets turned on before unlock */ 983+ } else { 984+ void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); 985+ int index; 986+ 987+ if (_raw_spin_trylock(lock_ptr)) { 988+ index = lstat_update(lock_ptr, this_pc, 989+ LSTAT_ACT_NO_WAIT); 990+ } else { 991+ uint32_t start_cycles = get_cycles(); 992+ _raw_spin_lock(lock_ptr); /* do the real lock */ 993+ index = lstat_update_time(lock_ptr, this_pc, 994+ LSTAT_ACT_SPIN, get_cycles() - start_cycles); 995+ } 996+ /* save the index in the lock itself for use in spin unlock */ 997+ PUT_INDEX(lock_ptr, index); 998+ } 999+} 1000+/* some archs require this for atomic_dec_and_lock in modules */ 1001+EXPORT_SYMBOL(_metered_spin_lock); 1002+ 1003+void 1004+_metered_spin_lock_flags(spinlock_t * lock_ptr, unsigned long *flags, 1005+ void *caller_pc) 1006+{ 1007+ if (lstat_control.state == LSTAT_OFF) { 1008+ _raw_spin_lock_flags(lock_ptr, flags); /* do the real lock */ 1009+ PUT_INDEX(lock_ptr, 0); /* clean index in case lockmetering */ 1010+ /* gets turned on before unlock */ 1011+ } else { 
1012+ void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); 1013+ int index; 1014+ 1015+ if (_raw_spin_trylock(lock_ptr)) { 1016+ index = lstat_update(lock_ptr, this_pc, 1017+ LSTAT_ACT_NO_WAIT); 1018+ } else { 1019+ uint32_t start_cycles = get_cycles(); 1020+ /* do the real lock */ 1021+ _raw_spin_lock_flags(lock_ptr, flags); 1022+ index = lstat_update_time(lock_ptr, this_pc, 1023+ LSTAT_ACT_SPIN, get_cycles() - start_cycles); 1024+ } 1025+ /* save the index in the lock itself for use in spin unlock */ 1026+ PUT_INDEX(lock_ptr, index); 1027+ } 1028+} 1029+ 1030+int 1031+_metered_spin_trylock(spinlock_t * lock_ptr, void *caller_pc) 1032+{ 1033+ if (lstat_control.state == LSTAT_OFF) { 1034+ return _raw_spin_trylock(lock_ptr); 1035+ } else { 1036+ int retval; 1037+ void *this_pc = LSTAT_RA(LSTAT_RA_SPIN); 1038+ 1039+ if ((retval = _raw_spin_trylock(lock_ptr))) { 1040+ int index = lstat_update(lock_ptr, this_pc, 1041+ LSTAT_ACT_NO_WAIT); 1042+ /* 1043+ * save the index in the lock itself for use in spin 1044+ * unlock 1045+ */ 1046+ PUT_INDEX(lock_ptr, index); 1047+ } else { 1048+ lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT); 1049+ } 1050+ 1051+ return retval; 1052+ } 1053+} 1054+ 1055+void 1056+_metered_spin_unlock(spinlock_t * lock_ptr) 1057+{ 1058+ int index = -1; 1059+ 1060+ if (lstat_control.state != LSTAT_OFF) { 1061+ index = GET_INDEX(lock_ptr); 1062+ /* 1063+ * If statistics were turned off when we set the lock, 1064+ * then the index can be zero. If that is the case, 1065+ * then collect no stats on this call. 1066+ */ 1067+ if (index > 0) { 1068+ uint32_t hold_time; 1069+ int cpu = THIS_CPU_NUMBER; 1070+ hold_time = get_cycles() - 1071+ (*lstat_control.counts[cpu])[index].acquire_time; 1072+ (*lstat_control.counts[cpu])[index].cum_hold_ticks += 1073+ (uint64_t) hold_time; 1074+ if ((*lstat_control.counts[cpu])[index].max_hold_ticks < 1075+ hold_time) 1076+ (*lstat_control.counts[cpu])[index]. 1077+ max_hold_ticks = hold_time; 1078+ } 1079+ } 1080+ 1081+ /* make sure we don't have a stale index value saved */ 1082+ PUT_INDEX(lock_ptr, 0); 1083+ _raw_spin_unlock(lock_ptr); /* do the real unlock */ 1084+} 1085+/* some archs require this for atomic_dec_and_lock in modules*/ 1086+EXPORT_SYMBOL(_metered_spin_unlock); 1087+ 1088+/* 1089+ * allocate the next global read lock structure and store its index 1090+ * in the rwlock at "lock_ptr". 
1091+ */ 1092+uint32_t 1093+alloc_rwlock_struct(rwlock_t * rwlock_ptr) 1094+{ 1095+ int index; 1096+ unsigned long flags; 1097+ int cpu = THIS_CPU_NUMBER; 1098+ 1099+ /* If we've already overflowed, then do a quick exit */ 1100+ if (lstat_control.next_free_read_lock_index > 1101+ LSTAT_MAX_READ_LOCK_INDEX) { 1102+ lstat_control.rwlock_overflow++; 1103+ return 0; 1104+ } 1105+ 1106+ local_irq_save(flags); 1107+ _raw_spin_lock(&lstat_control.directory_lock); 1108+ 1109+ /* It is possible this changed while we were waiting for the directory_lock */ 1110+ if (lstat_control.state == LSTAT_OFF) { 1111+ index = 0; 1112+ goto unlock; 1113+ } 1114+ 1115+ /* It is possible someone else got here first and set the index */ 1116+ if ((index = GET_RWINDEX(rwlock_ptr)) == 0) { 1117+ /* 1118+ * we can't turn on read stats for this lock while there are 1119+ * readers (this would mess up the running hold time sum at 1120+ * unlock time) 1121+ */ 1122+ if (RWLOCK_READERS(rwlock_ptr) != 0) { 1123+ index = 0; 1124+ goto unlock; 1125+ } 1126+ 1127+ /* 1128+ * if stats are turned on after being off, we may need to 1129+ * return an old index from when the statistics were on last 1130+ * time. 1131+ */ 1132+ for (index = 1; index < lstat_control.next_free_read_lock_index; 1133+ index++) 1134+ if ((*lstat_control.read_lock_counts[cpu])[index]. 1135+ lock_ptr == rwlock_ptr) 1136+ goto put_index_and_unlock; 1137+ 1138+ /* allocate the next global read lock structure */ 1139+ if (lstat_control.next_free_read_lock_index >= 1140+ LSTAT_MAX_READ_LOCK_INDEX) { 1141+ lstat_control.rwlock_overflow++; 1142+ index = 0; 1143+ goto unlock; 1144+ } 1145+ index = lstat_control.next_free_read_lock_index++; 1146+ 1147+ /* 1148+ * initialize the global read stats data structure for each 1149+ * cpu 1150+ */ 1151+ for_each_online_cpu(cpu) { 1152+ (*lstat_control.read_lock_counts[cpu])[index].lock_ptr = 1153+ rwlock_ptr; 1154+ } 1155+put_index_and_unlock: 1156+ /* store the index for the read lock structure into the lock */ 1157+ PUT_RWINDEX(rwlock_ptr, index); 1158+ } 1159+ 1160+unlock: 1161+ _raw_spin_unlock(&lstat_control.directory_lock); 1162+ local_irq_restore(flags); 1163+ return index; 1164+} 1165+ 1166+void 1167+_metered_read_lock(rwlock_t * rwlock_ptr, void *caller_pc) 1168+{ 1169+ void *this_pc; 1170+ uint32_t start_cycles; 1171+ int index; 1172+ int cpu; 1173+ unsigned long flags; 1174+ int readers_before, readers_after; 1175+ uint64_t cycles64; 1176+ 1177+ if (lstat_control.state == LSTAT_OFF) { 1178+ _raw_read_lock(rwlock_ptr); 1179+ /* clean index in case lockmetering turns on before an unlock */ 1180+ PUT_RWINDEX(rwlock_ptr, 0); 1181+ return; 1182+ } 1183+ 1184+ this_pc = LSTAT_RA(LSTAT_RA_READ); 1185+ cpu = THIS_CPU_NUMBER; 1186+ index = GET_RWINDEX(rwlock_ptr); 1187+ 1188+ /* allocate the global stats entry for this lock, if needed */ 1189+ if (index == 0) 1190+ index = alloc_rwlock_struct(rwlock_ptr); 1191+ 1192+ readers_before = RWLOCK_READERS(rwlock_ptr); 1193+ if (_raw_read_trylock(rwlock_ptr)) { 1194+ /* 1195+ * We have decremented the lock to count a new reader, 1196+ * and have confirmed that no writer has it locked. 1197+ */ 1198+ /* update statistics if enabled */ 1199+ if (index > 0) { 1200+ local_irq_save(flags); 1201+ lstat_update((void *) rwlock_ptr, this_pc, 1202+ LSTAT_ACT_NO_WAIT); 1203+ /* preserve value of TSC so cum_hold_ticks and start_busy use same value */ 1204+ cycles64 = get_cycles64(); 1205+ (*lstat_control.read_lock_counts[cpu])[index]. 
1206+ cum_hold_ticks -= cycles64; 1207+ 1208+ /* record time and cpu of start of busy period */ 1209+ /* this is not perfect (some race conditions are possible) */ 1210+ if (readers_before == 0) { 1211+ (*lstat_control.read_lock_counts[cpu])[index]. 1212+ start_busy = cycles64; 1213+ PUT_RW_CPU(rwlock_ptr, cpu); 1214+ } 1215+ readers_after = RWLOCK_READERS(rwlock_ptr); 1216+ if (readers_after > 1217+ (*lstat_control.read_lock_counts[cpu])[index]. 1218+ max_readers) 1219+ (*lstat_control.read_lock_counts[cpu])[index]. 1220+ max_readers = readers_after; 1221+ local_irq_restore(flags); 1222+ } 1223+ 1224+ return; 1225+ } 1226+ /* If we get here, then we could not quickly grab the read lock */ 1227+ 1228+ start_cycles = get_cycles(); /* start counting the wait time */ 1229+ 1230+ /* Now spin until read_lock is successful */ 1231+ _raw_read_lock(rwlock_ptr); 1232+ 1233+ lstat_update_time((void *) rwlock_ptr, this_pc, LSTAT_ACT_SPIN, 1234+ get_cycles() - start_cycles); 1235+ 1236+ /* update statistics if they are enabled for this lock */ 1237+ if (index > 0) { 1238+ local_irq_save(flags); 1239+ cycles64 = get_cycles64(); 1240+ (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -= 1241+ cycles64; 1242+ 1243+ /* this is not perfect (some race conditions are possible) */ 1244+ if (readers_before == 0) { 1245+ (*lstat_control.read_lock_counts[cpu])[index]. 1246+ start_busy = cycles64; 1247+ PUT_RW_CPU(rwlock_ptr, cpu); 1248+ } 1249+ readers_after = RWLOCK_READERS(rwlock_ptr); 1250+ if (readers_after > 1251+ (*lstat_control.read_lock_counts[cpu])[index].max_readers) 1252+ (*lstat_control.read_lock_counts[cpu])[index]. 1253+ max_readers = readers_after; 1254+ local_irq_restore(flags); 1255+ } 1256+} 1257+ 1258+void 1259+_metered_read_unlock(rwlock_t * rwlock_ptr) 1260+{ 1261+ int index; 1262+ int cpu; 1263+ unsigned long flags; 1264+ uint64_t busy_length; 1265+ uint64_t cycles64; 1266+ 1267+ if (lstat_control.state == LSTAT_OFF) { 1268+ _raw_read_unlock(rwlock_ptr); 1269+ return; 1270+ } 1271+ 1272+ index = GET_RWINDEX(rwlock_ptr); 1273+ cpu = THIS_CPU_NUMBER; 1274+ 1275+ if (index > 0) { 1276+ local_irq_save(flags); 1277+ /* 1278+ * preserve value of TSC so cum_hold_ticks and busy_ticks are 1279+ * consistent. 1280+ */ 1281+ cycles64 = get_cycles64(); 1282+ (*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks += 1283+ cycles64; 1284+ (*lstat_control.read_lock_counts[cpu])[index].read_lock_count++; 1285+ 1286+ /* 1287+ * once again, this is not perfect (some race conditions are 1288+ * possible) 1289+ */ 1290+ if (RWLOCK_READERS(rwlock_ptr) == 1) { 1291+ int cpu1 = GET_RW_CPU(rwlock_ptr); 1292+ uint64_t last_start_busy = 1293+ (*lstat_control.read_lock_counts[cpu1])[index]. 1294+ start_busy; 1295+ (*lstat_control.read_lock_counts[cpu])[index]. 1296+ busy_periods++; 1297+ if (cycles64 > last_start_busy) { 1298+ busy_length = cycles64 - last_start_busy; 1299+ (*lstat_control.read_lock_counts[cpu])[index]. 1300+ busy_ticks += busy_length; 1301+ if (busy_length > 1302+ (*lstat_control. 1303+ read_lock_counts[cpu])[index]. 1304+ max_busy) 1305+ (*lstat_control. 1306+ read_lock_counts[cpu])[index]. 
1307+ max_busy = busy_length; 1308+ } 1309+ } 1310+ local_irq_restore(flags); 1311+ } 1312+ _raw_read_unlock(rwlock_ptr); 1313+} 1314+ 1315+void 1316+_metered_write_lock(rwlock_t * rwlock_ptr, void *caller_pc) 1317+{ 1318+ uint32_t start_cycles; 1319+ void *this_pc; 1320+ uint32_t spin_ticks = 0; /* in anticipation of a potential wait */ 1321+ int index; 1322+ int write_index = 0; 1323+ int cpu; 1324+ enum { 1325+ writer_writer_conflict, 1326+ writer_reader_conflict 1327+ } why_wait = writer_writer_conflict; 1328+ 1329+ if (lstat_control.state == LSTAT_OFF) { 1330+ _raw_write_lock(rwlock_ptr); 1331+ /* clean index in case lockmetering turns on before an unlock */ 1332+ PUT_RWINDEX(rwlock_ptr, 0); 1333+ return; 1334+ } 1335+ 1336+ this_pc = LSTAT_RA(LSTAT_RA_WRITE); 1337+ cpu = THIS_CPU_NUMBER; 1338+ index = GET_RWINDEX(rwlock_ptr); 1339+ 1340+ /* allocate the global stats entry for this lock, if needed */ 1341+ if (index == 0) { 1342+ index = alloc_rwlock_struct(rwlock_ptr); 1343+ } 1344+ 1345+ if (_raw_write_trylock(rwlock_ptr)) { 1346+ /* We acquired the lock on the first try */ 1347+ write_index = lstat_update((void *) rwlock_ptr, this_pc, 1348+ LSTAT_ACT_NO_WAIT); 1349+ /* save the write_index for use in unlock if stats enabled */ 1350+ if (index > 0) 1351+ (*lstat_control.read_lock_counts[cpu])[index]. 1352+ write_index = write_index; 1353+ return; 1354+ } 1355+ 1356+ /* If we get here, then we could not quickly grab the write lock */ 1357+ start_cycles = get_cycles(); /* start counting the wait time */ 1358+ 1359+ why_wait = RWLOCK_READERS(rwlock_ptr) ? 1360+ writer_reader_conflict : writer_writer_conflict; 1361+ 1362+ /* Now set the lock and wait for conflicts to disappear */ 1363+ _raw_write_lock(rwlock_ptr); 1364+ 1365+ spin_ticks = get_cycles() - start_cycles; 1366+ 1367+ /* update stats -- if enabled */ 1368+ if (index > 0 && spin_ticks) { 1369+ if (why_wait == writer_reader_conflict) { 1370+ /* waited due to a reader holding the lock */ 1371+ write_index = lstat_update_time((void *)rwlock_ptr, 1372+ this_pc, LSTAT_ACT_SPIN, spin_ticks); 1373+ } else { 1374+ /* 1375+ * waited due to another writer holding the lock 1376+ */ 1377+ write_index = lstat_update_time((void *)rwlock_ptr, 1378+ this_pc, LSTAT_ACT_WW_SPIN, spin_ticks); 1379+ (*lstat_control.counts[cpu])[write_index]. 1380+ cum_wait_ww_ticks += spin_ticks; 1381+ if (spin_ticks > 1382+ (*lstat_control.counts[cpu])[write_index]. 1383+ max_wait_ww_ticks) { 1384+ (*lstat_control.counts[cpu])[write_index]. 1385+ max_wait_ww_ticks = spin_ticks; 1386+ } 1387+ } 1388+ 1389+ /* save the directory index for use on write_unlock */ 1390+ (*lstat_control.read_lock_counts[cpu])[index]. 
+                        write_index = write_index;
+        }
+}
+
+void
+_metered_write_unlock(rwlock_t * rwlock_ptr)
+{
+        int index;
+        int cpu;
+        int write_index;
+        uint32_t hold_time;
+
+        if (lstat_control.state == LSTAT_OFF) {
+                _raw_write_unlock(rwlock_ptr);
+                return;
+        }
+
+        cpu = THIS_CPU_NUMBER;
+        index = GET_RWINDEX(rwlock_ptr);
+
+        /* update statistics if stats enabled for this lock */
+        if (index > 0) {
+                write_index =
+                    (*lstat_control.read_lock_counts[cpu])[index].write_index;
+
+                hold_time = get_cycles() -
+                    (*lstat_control.counts[cpu])[write_index].acquire_time;
+                (*lstat_control.counts[cpu])[write_index].cum_hold_ticks +=
+                    (uint64_t) hold_time;
+                if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks <
+                    hold_time)
+                        (*lstat_control.counts[cpu])[write_index].
+                            max_hold_ticks = hold_time;
+        }
+        _raw_write_unlock(rwlock_ptr);
+}
+
+int
+_metered_write_trylock(rwlock_t * rwlock_ptr, void *caller_pc)
+{
+        int retval;
+        void *this_pc = LSTAT_RA(LSTAT_RA_WRITE);
+
+        if ((retval = _raw_write_trylock(rwlock_ptr))) {
+                lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT);
+        } else {
+                lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT);
+        }
+
+        return retval;
+}
+
+static void
+init_control_space(void)
+{
+        /* Set all control space pointers to null and indices to "empty" */
+        int cpu;
+
+        /*
+         * Access CPU_CYCLE_FREQUENCY at the outset, which in some
+         * architectures may trigger a runtime calculation that uses a
+         * spinlock.  Let's do this before lockmetering is turned on.
+         */
+        if (CPU_CYCLE_FREQUENCY == 0)
+                BUG();
+
+        lstat_control.hashtab = NULL;
+        lstat_control.dir = NULL;
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                lstat_control.counts[cpu] = NULL;
+                lstat_control.read_lock_counts[cpu] = NULL;
+        }
+}
+
+static int
+reset_lstat_data(void)
+{
+        int cpu, flags;
+
+        flags = 0;
+        lstat_control.next_free_dir_index = 1;  /* 0 is for overflows */
+        lstat_control.next_free_read_lock_index = 1;
+        lstat_control.dir_overflow = 0;
+        lstat_control.rwlock_overflow = 0;
+
+        lstat_control.started_cycles64 = 0;
+        lstat_control.ending_cycles64 = 0;
+        lstat_control.enabled_cycles64 = 0;
+        lstat_control.first_started_time = 0;
+        lstat_control.started_time = 0;
+        lstat_control.ending_time = 0;
+        lstat_control.intervals = 0;
+
+        /*
+         * paranoia -- in case someone does a "lockstat reset" before
+         * "lockstat on"
+         */
+        if (lstat_control.hashtab) {
+                bzero(lstat_control.hashtab,
+                        LSTAT_HASH_TABLE_SIZE * sizeof (short));
+                bzero(lstat_control.dir, LSTAT_MAX_STAT_INDEX *
+                        sizeof (lstat_directory_entry_t));
+
+                for_each_online_cpu(cpu) {
+                        bzero(lstat_control.counts[cpu],
+                                sizeof (lstat_cpu_counts_t));
+                        bzero(lstat_control.read_lock_counts[cpu],
+                                sizeof (lstat_read_lock_cpu_counts_t));
+                }
+        }
+#ifdef NOTDEF
+        _raw_spin_unlock(&lstat_control.directory_lock);
+        local_irq_restore(flags);
+#endif
+        return 1;
+}
+
+static void
+release_control_space(void)
+{
+        /*
+         * Called either (1) when an allocation of kmem fails
+         * or (2) when the user writes LSTAT_RELEASE to /proc/lockmeter.
+         * Assume that all pointers have been initialized to zero,
+         * i.e., nonzero pointers are valid addresses.
+         */
+        int cpu;
+
+        if (lstat_control.hashtab) {
+                kfree(lstat_control.hashtab);
+                lstat_control.hashtab = NULL;
+        }
+
+        if (lstat_control.dir) {
+                vfree(lstat_control.dir);
+                lstat_control.dir = NULL;
+        }
+
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (lstat_control.counts[cpu]) {
+                        vfree(lstat_control.counts[cpu]);
+                        lstat_control.counts[cpu] = NULL;
+                }
+                if (lstat_control.read_lock_counts[cpu]) {
+                        kfree(lstat_control.read_lock_counts[cpu]);
+                        lstat_control.read_lock_counts[cpu] = NULL;
+                }
+        }
+}
+
+int
+get_lockmeter_info_size(void)
+{
+        return sizeof (lstat_user_request_t)
+                + num_online_cpus() * sizeof (lstat_cpu_counts_t)
+                + num_online_cpus() * sizeof (lstat_read_lock_cpu_counts_t)
+                + (LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t));
+}
+
+ssize_t
+get_lockmeter_info(char *buffer, size_t max_len, loff_t * last_index)
+{
+        lstat_user_request_t req;
+        struct timeval tv;
+        ssize_t next_ret_bcount;
+        ssize_t actual_ret_bcount = 0;
+        int cpu;
+
+        *last_index = 0;        /* a one-shot read */
+
+        req.lstat_version = LSTAT_VERSION;
+        req.state = lstat_control.state;
+        req.maxcpus = num_online_cpus();
+        req.cycleval = CPU_CYCLE_FREQUENCY;
+#ifdef notyet
+        req.kernel_magic_addr = (void *) &_etext;
+        req.kernel_end_addr = (void *) &_etext;
+#endif
+        req.uts = system_utsname;
+        req.intervals = lstat_control.intervals;
+
+        req.first_started_time = lstat_control.first_started_time;
+        req.started_time = lstat_control.started_time;
+        req.started_cycles64 = lstat_control.started_cycles64;
+
+        req.next_free_dir_index = lstat_control.next_free_dir_index;
+        req.next_free_read_lock_index = lstat_control.next_free_read_lock_index;
+        req.dir_overflow = lstat_control.dir_overflow;
+        req.rwlock_overflow = lstat_control.rwlock_overflow;
+
+        if (lstat_control.state == LSTAT_OFF) {
+                if (req.intervals == 0) {
+                        /* measurement is off and no valid data present */
+                        next_ret_bcount = sizeof (lstat_user_request_t);
+                        req.enabled_cycles64 = 0;
+
+                        if ((actual_ret_bcount + next_ret_bcount) > max_len)
+                                return actual_ret_bcount;
+
+                        copy_to_user(buffer, (void *) &req, next_ret_bcount);
+                        actual_ret_bcount += next_ret_bcount;
+                        return actual_ret_bcount;
+                } else {
+                        /*
+                         * measurement is off but valid data present
+                         * fetch time info from lstat_control
+                         */
+                        req.ending_time = lstat_control.ending_time;
+                        req.ending_cycles64 = lstat_control.ending_cycles64;
+                        req.enabled_cycles64 = lstat_control.enabled_cycles64;
+                }
+        } else {
+                /*
+                 * this must be a read while data active--use current time,
+                 * etc
+                 */
+                do_gettimeofday(&tv);
+                req.ending_time = tv.tv_sec;
+                req.ending_cycles64 = get_cycles64();
+                req.enabled_cycles64 = req.ending_cycles64 -
+                        req.started_cycles64 + lstat_control.enabled_cycles64;
+        }
+
+        next_ret_bcount = sizeof (lstat_user_request_t);
+        if ((actual_ret_bcount + next_ret_bcount) > max_len)
+                return actual_ret_bcount;
+
+        copy_to_user(buffer, (void *) &req, next_ret_bcount);
+        actual_ret_bcount += next_ret_bcount;
+
+        if (!lstat_control.counts[0])   /* not initialized? */
+                return actual_ret_bcount;
+
+        next_ret_bcount = sizeof (lstat_cpu_counts_t);
+        for_each_online_cpu(cpu) {
+                if ((actual_ret_bcount + next_ret_bcount) > max_len)
+                        return actual_ret_bcount;       /* leave early */
+                copy_to_user(buffer + actual_ret_bcount,
+                                lstat_control.counts[cpu], next_ret_bcount);
+                actual_ret_bcount += next_ret_bcount;
+        }
+
+        next_ret_bcount = LSTAT_MAX_STAT_INDEX *
+                        sizeof (lstat_directory_entry_t);
+        if (((actual_ret_bcount + next_ret_bcount) > max_len)
+                        || !lstat_control.dir)
+                return actual_ret_bcount;               /* leave early */
+
+        copy_to_user(buffer + actual_ret_bcount, lstat_control.dir,
+                        next_ret_bcount);
+        actual_ret_bcount += next_ret_bcount;
+
+        next_ret_bcount = sizeof (lstat_read_lock_cpu_counts_t);
+        for_each_online_cpu(cpu) {
+                if (actual_ret_bcount + next_ret_bcount > max_len)
+                        return actual_ret_bcount;
+                copy_to_user(buffer + actual_ret_bcount,
+                                lstat_control.read_lock_counts[cpu],
+                                next_ret_bcount);
+                actual_ret_bcount += next_ret_bcount;
+        }
+
+        return actual_ret_bcount;
+}
+
+/*
+ * Writing to the /proc lockmeter node enables or disables metering,
+ * based upon the first byte of the "written" data.
+ * The following values are defined:
+ * LSTAT_ON: 1st call: allocates storage, initializes and turns on measurement
+ *           subsequent calls just turn on measurement
+ * LSTAT_OFF: turns off measurement
+ * LSTAT_RESET: resets statistics
+ * LSTAT_RELEASE: releases statistics storage
+ *
+ * This allows one to accumulate statistics over several lockstat runs:
+ *
+ * lockstat on
+ * lockstat off
+ * ...repeat above as desired...
+ * lockstat get
+ * ...now start a new set of measurements...
+ * lockstat reset
+ * lockstat on
+ * ...
+ *
+ */
+ssize_t
+put_lockmeter_info(const char *buffer, size_t len)
+{
+        int error = 0;
+        int dirsize, countsize, read_lock_countsize, hashsize;
+        int cpu;
+        char put_char;
+        int i, read_lock_blocks;
+        unsigned long flags;
+        rwlock_t *lock_ptr;
+        struct timeval tv;
+
+        if (len <= 0)
+                return -EINVAL;
+
+        _raw_spin_lock(&lstat_control.control_lock);
+
+        get_user(put_char, buffer);
+        switch (put_char) {
+
+        case LSTAT_OFF:
+                if (lstat_control.state != LSTAT_OFF) {
+                        /*
+                         * To avoid seeing read lock hold times in an
+                         * inconsistent state, we have to follow this protocol
+                         * to turn off statistics
+                         */
+                        local_irq_save(flags);
+                        /*
+                         * getting this lock will stop any read lock block
+                         * allocations
+                         */
+                        _raw_spin_lock(&lstat_control.directory_lock);
+                        /*
+                         * keep any more read lock blocks from being
+                         * allocated
+                         */
+                        lstat_control.state = LSTAT_OFF;
+                        /* record how many read lock blocks there are */
+                        read_lock_blocks =
+                                lstat_control.next_free_read_lock_index;
+                        _raw_spin_unlock(&lstat_control.directory_lock);
+                        /* now go through the list of read locks */
+                        cpu = THIS_CPU_NUMBER;
+                        for (i = 1; i < read_lock_blocks; i++) {
+                                lock_ptr =
+                                    (*lstat_control.read_lock_counts[cpu])[i].
+                                    lock_ptr;
+                                /* is this saved lock address still valid?
1727+				if (GET_RWINDEX(lock_ptr) == i) {
1728+					/*
1729+					 * lock address appears to still be
1730+					 * valid; because we only hold one lock
1731+					 * at a time, this can't cause a
1732+					 * deadlock unless this is a lock held
1733+					 * as part of the current system call
1734+					 * path. At the moment there
1735+					 * are no READ mode locks held to get
1736+					 * here from user space, so we solve
1737+					 * this by skipping locks held in
1738+					 * write mode.
1739+					 */
1740+					if (RWLOCK_IS_WRITE_LOCKED(lock_ptr)) {
1741+						PUT_RWINDEX(lock_ptr, 0);
1742+						continue;
1743+					}
1744+					/*
1745+					 * now we know there are no read
1746+					 * holders of this lock!  stop
1747+					 * statistics collection for this
1748+					 * lock
1749+					 */
1750+					_raw_write_lock(lock_ptr);
1751+					PUT_RWINDEX(lock_ptr, 0);
1752+					_raw_write_unlock(lock_ptr);
1753+				}
1754+				/*
1755+				 * it may still be possible for the hold time
1756+				 * sum to be negative, e.g. if a lock is
1757+				 * reallocated while "busy"; we will have to
1758+				 * fix this up in the data reduction program.
1759+				 */
1760+			}
1761+			local_irq_restore(flags);
1762+			lstat_control.intervals++;
1763+			lstat_control.ending_cycles64 = get_cycles64();
1764+			lstat_control.enabled_cycles64 +=
1765+				lstat_control.ending_cycles64 -
1766+				lstat_control.started_cycles64;
1767+			do_gettimeofday(&tv);
1768+			lstat_control.ending_time = tv.tv_sec;
1769+			/*
1770+			 * don't deallocate the structures -- we may do a
1771+			 * "lockstat on" to add to the data that is already
1772+			 * there. Use LSTAT_RELEASE to release storage.
1773+			 */
1774+		} else {
1775+			error = -EBUSY;	/* already OFF */
1776+		}
1777+		break;
1778+
1779+	case LSTAT_ON:
1780+		if (lstat_control.state == LSTAT_OFF) {
1781+#ifdef DEBUG_LOCKMETER
1782+			printk("put_lockmeter_info(cpu=%d): LSTAT_ON\n",
1783+				THIS_CPU_NUMBER);
1784+#endif
1785+			lstat_control.next_free_dir_index = 1;	/* 0 is for overflows */
1786+
1787+			dirsize = LSTAT_MAX_STAT_INDEX *
1788+				sizeof (lstat_directory_entry_t);
1789+			hashsize =
1790+				(1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort);
1791+			countsize = sizeof (lstat_cpu_counts_t);
1792+			read_lock_countsize =
1793+				sizeof (lstat_read_lock_cpu_counts_t);
1794+#ifdef DEBUG_LOCKMETER
1795+			printk(" dirsize:%d", dirsize);
1796+			printk(" hashsize:%d", hashsize);
1797+			printk(" countsize:%d", countsize);
1798+			printk(" read_lock_countsize:%d\n",
1799+				read_lock_countsize);
1800+#endif
1801+#ifdef DEBUG_LOCKMETER
1802+			{
1803+				int secs;
1804+				unsigned long cycles;
1805+				uint64_t cycles64;
1806+
1807+				do_gettimeofday(&tv);
1808+				secs = tv.tv_sec;
1809+				do {
1810+					do_gettimeofday(&tv);
1811+				} while (secs == tv.tv_sec);
1812+				cycles = get_cycles();
1813+				cycles64 = get_cycles64();
1814+				secs = tv.tv_sec;
1815+				do {
1816+					do_gettimeofday(&tv);
1817+				} while (secs == tv.tv_sec);
1818+				cycles = get_cycles() - cycles;
1819+				cycles64 = get_cycles64() - cycles64;
1820+				printk("lockmeter: cycleFrequency:%d "
1821+					"cycles:%d cycles64:%d\n",
1822+					CPU_CYCLE_FREQUENCY, cycles, cycles64);
1823+			}
1824+#endif
1825+
1826+			/*
1827+			 * if this is the first call, allocate storage and
1828+			 * initialize
1829+			 */
1830+			if (!lstat_control.hashtab) {
1831+
1832+				spin_lock_init(&lstat_control.directory_lock);
1833+
1834+				/* guarantee all pointers at zero */
1835+				init_control_space();
1836+
1837+				lstat_control.hashtab =
1838+					kmalloc(hashsize, GFP_KERNEL);
1839+				if (!lstat_control.hashtab) {
1840+					error = -ENOSPC;
1841+#ifdef DEBUG_LOCKMETER
1842+					printk("!!error kmalloc of hashtab\n");
1843+#endif
1844+				}
1845+				lstat_control.dir = vmalloc(dirsize);
1846+				if (!lstat_control.dir) {
1847+					error = -ENOSPC;
1848+#ifdef DEBUG_LOCKMETER
1849+					printk("!!error vmalloc of dir\n");
1850+#endif
1851+				}
1852+
1853+				for_each_online_cpu(cpu) {
1854+					lstat_control.counts[cpu] =
1855+						vmalloc(countsize);
1856+					if (!lstat_control.counts[cpu]) {
1857+						error = -ENOSPC;
1858+#ifdef DEBUG_LOCKMETER
1859+						printk("!!error vmalloc of "
1860+							"counts[%d]\n", cpu);
1861+#endif
1862+					}
1863+					lstat_control.read_lock_counts[cpu] =
1864+						(lstat_read_lock_cpu_counts_t *)
1865+						kmalloc(read_lock_countsize,
1866+							GFP_KERNEL);
1867+					if (!lstat_control.
1868+							read_lock_counts[cpu]) {
1869+						error = -ENOSPC;
1870+#ifdef DEBUG_LOCKMETER
1871+						printk("!!error kmalloc of "
1872+							"read_lock_counts[%d]\n",
1873+							cpu);
1874+#endif
1875+					}
1876+				}
1877+			}
1878+
1879+			if (error) {
1880+				/*
1881+				 * One or more allocation failures -- free
1882+				 * everything
1883+				 */
1884+				release_control_space();
1885+			} else {
1886+
1887+				if (!reset_lstat_data()) {
1888+					error = -EINVAL;
1889+					break;
1890+				}
1891+
1892+				/*
1893+				 * record starting and ending times and the
1894+				 * like
1895+				 */
1896+				if (lstat_control.intervals == 0) {
1897+					do_gettimeofday(&tv);
1898+					lstat_control.first_started_time =
1899+						tv.tv_sec;
1900+				}
1901+				lstat_control.started_cycles64 = get_cycles64();
1902+				do_gettimeofday(&tv);
1903+				lstat_control.started_time = tv.tv_sec;
1904+
1905+				lstat_control.state = LSTAT_ON;
1906+			}
1907+		} else {
1908+			error = -EBUSY;	/* already ON */
1909+		}
1910+		break;
1911+
1912+	case LSTAT_RESET:
1913+		if (lstat_control.state == LSTAT_OFF) {
1914+			if (!reset_lstat_data())
1915+				error = -EINVAL;
1916+		} else {
1917+			error = -EBUSY;	/* still on; can't reset */
1918+		}
1919+		break;
1920+
1921+	case LSTAT_RELEASE:
1922+		if (lstat_control.state == LSTAT_OFF) {
1923+			release_control_space();
1924+			lstat_control.intervals = 0;
1925+			lstat_control.enabled_cycles64 = 0;
1926+		} else {
1927+			error = -EBUSY;
1928+		}
1929+		break;
1930+
1931+	default:
1932+		error = -EINVAL;
1933+	}	/* switch */
1934+
1935+	_raw_spin_unlock(&lstat_control.control_lock);
1936+	return error ? error : len;
1937+}
1938+
1939+#ifdef USER_MODE_TESTING
1940+/* following used for user mode testing */
1941+void
1942+lockmeter_init()
1943+{
1944+	int dirsize, hashsize, countsize, read_lock_countsize, cpu;
1945+
1946+	printf("lstat_control is at %x size=%d\n", &lstat_control,
1947+		sizeof (lstat_control));
1948+	printf("sizeof(spinlock_t)=%d\n", sizeof (spinlock_t));
1949+	lstat_control.state = LSTAT_ON;
1950+
1951+	lstat_control.directory_lock = SPIN_LOCK_UNLOCKED;
1952+	lstat_control.next_free_dir_index = 1;	/* 0 is for overflows */
1953+	lstat_control.next_free_read_lock_index = 1;
1954+
1955+	dirsize = LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t);
1956+	hashsize = (1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort);
1957+	countsize = sizeof (lstat_cpu_counts_t);
1958+	read_lock_countsize = sizeof (lstat_read_lock_cpu_counts_t);
1959+
1960+	lstat_control.hashtab = (ushort *) malloc(hashsize);
1961+
1962+	if (lstat_control.hashtab == 0) {
1963+		printf("malloc failure at line %d in lockmeter.c\n",
1964+			__LINE__);
1965+		exit(0);
1966+	}
1967+
1968+	lstat_control.dir = (lstat_directory_entry_t *) malloc(dirsize);
1969+
1970+	if (lstat_control.dir == 0) {
1971+		printf("malloc failure at line %d in lockmeter.c\n",
1972+			__LINE__);
1973+		exit(0);
1974+	}
1975+
1976+	for_each_online_cpu(cpu) {
1977+		int j, k;
1978+		j = (int) (lstat_control.counts[cpu] =
1979+			(lstat_cpu_counts_t *) malloc(countsize));
1980+		k = (int) (lstat_control.read_lock_counts[cpu] =
1981+			(lstat_read_lock_cpu_counts_t *)
1982+			malloc(read_lock_countsize));
1983+		if (j * k == 0) {
1984+			printf("malloc failure for cpu=%d at line %d in "
1985+				"lockmeter.c\n", cpu, __LINE__);
1986+			exit(0);
1987+		}
1988+	}
1989+
1990+	memset(lstat_control.hashtab, 0, hashsize);
1991+	memset(lstat_control.dir, 0, dirsize);
1992+
1993+	for_each_online_cpu(cpu) {
1994+		memset(lstat_control.counts[cpu], 0, countsize);
1995+		memset(lstat_control.read_lock_counts[cpu], 0,
1996+			read_lock_countsize);
1997+	}
1998+}
1999+#endif
2000+
2001+#ifdef CONFIG_PREEMPT
2002+xxx huh??
2003+#endif
2004+#if defined(CONFIG_PREEMPT)
2005+xxx huh??
2006+#endif
2007+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
2008+xxx huh??
2009+#endif
2010+
2011+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
2012+xxxx lockmeter cannot support CONFIG_PREEMPT now
2013+/*
2014+ * This could be a long-held lock. If another CPU holds it for a long time,
2015+ * and that CPU is not asked to reschedule then *this* CPU will spin on the
2016+ * lock for a long time, even if *this* CPU is asked to reschedule.
2017+ *
2018+ * So what we do here, in the slow (contended) path, is to spin on the lock
2019+ * by hand while permitting preemption.
2020+ *
2021+ * Called inside preempt_disable().
2022+ */
2023+static inline void __preempt_spin_lock(spinlock_t *lock, void *caller_pc)
2024+{
2025+	if (preempt_count() > 1) {
2026+		_metered_spin_lock(lock, caller_pc);
2027+		return;
2028+	}
2029+
2030+	do {
2031+		preempt_enable();
2032+		while (spin_is_locked(lock))
2033+			cpu_relax();
2034+		preempt_disable();
2035+	} while (!_metered_spin_trylock(lock, caller_pc));
2036+}
2037+
2038+void __lockfunc _spin_lock(spinlock_t *lock)
2039+{
2040+	preempt_disable();
2041+	if (unlikely(!_metered_spin_trylock(lock, __builtin_return_address(0))))
2042+		__preempt_spin_lock(lock, __builtin_return_address(0));
2043+}
2044+
2045+static inline void __preempt_write_lock(rwlock_t *lock, void *caller_pc)
2046+{
2047+	if (preempt_count() > 1) {
2048+		_metered_write_lock(lock, caller_pc);
2049+		return;
2050+	}
2051+
2052+	do {
2053+		preempt_enable();
2054+		while (rwlock_is_locked(lock))
2055+			cpu_relax();
2056+		preempt_disable();
2057+	} while (!_metered_write_trylock(lock, caller_pc));
2058+}
2059+
2060+void __lockfunc _write_lock(rwlock_t *lock)
2061+{
2062+	preempt_disable();
2063+	if (unlikely(!_metered_write_trylock(lock, __builtin_return_address(0))))
2064+		__preempt_write_lock(lock, __builtin_return_address(0));
2065+}
2066+#endif
2067Index: linux/arch/sparc64/Kconfig.debug
2068===================================================================
2069--- linux.orig/arch/sparc64/Kconfig.debug	2005-11-18 14:58:48.000000000 -0800
2070+++ linux/arch/sparc64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
2071@@ -41,6 +41,13 @@
2072 	 This results in a large slowdown, but helps to find certain types
2073 	 of memory corruptions.
2074 
2075+config LOCKMETER
2076+	bool "Kernel lock metering"
2077+	depends on SMP && !PREEMPT
2078+	help
2079+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
2080+	  but allows you to see various statistics using the lockstat command.
2081+
2082 config MCOUNT
2083 	bool
2084 	depends on STACK_DEBUG
2085Index: linux/arch/x86_64/Kconfig.debug
2086===================================================================
2087--- linux.orig/arch/x86_64/Kconfig.debug	2005-11-18 14:58:48.000000000 -0800
2088+++ linux/arch/x86_64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
2089@@ -54,4 +54,11 @@
2090 #config X86_REMOTE_DEBUG
2091 #	bool "kgdb debugging stub"
2092 
2093+config LOCKMETER
2094+	bool "Kernel lock metering"
2095+	depends on SMP && !PREEMPT
2096+	help
2097+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
2098+	  but allows you to see various statistics using the lockstat command.
2099+
2100 endmenu
2101Index: linux/kernel/Makefile
2102===================================================================
2103--- linux.orig/kernel/Makefile	2005-11-18 14:58:48.000000000 -0800
2104+++ linux/kernel/Makefile	2005-11-18 14:59:31.000000000 -0800
2105@@ -13,6 +13,7 @@
2106 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
2107 obj-$(CONFIG_SMP) += cpu.o spinlock.o
2108 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
2109+obj-$(CONFIG_LOCKMETER) += lockmeter.o
2110 obj-$(CONFIG_UID16) += uid16.o
2111 obj-$(CONFIG_MODULES) += module.o
2112 obj-$(CONFIG_KALLSYMS) += kallsyms.o
2113Index: linux/include/linux/spinlock_types.h
2114===================================================================
2115--- linux.orig/include/linux/spinlock_types.h	2005-11-18 14:58:48.000000000 -0800
2116+++ linux/include/linux/spinlock_types.h	2005-11-18 14:59:31.000000000 -0800
2117@@ -17,6 +17,11 @@
2118 
2119 typedef struct {
2120 	raw_spinlock_t raw_lock;
2121+#if defined(CONFIG_LOCKMETER)
2122+	/* required for LOCKMETER since all bits in lock */
2123+	/* are used and we need this storage for lock INDEX */
2124+	volatile unsigned int index;
2125+#endif
2126 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
2127 	unsigned int break_lock;
2128 #endif
2129@@ -33,6 +38,10 @@
2130 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
2131 	unsigned int break_lock;
2132 #endif
2133+#ifdef CONFIG_LOCKMETER
2134+	volatile unsigned short index;
2135+	volatile unsigned short cpu;
2136+#endif
2137 #ifdef CONFIG_DEBUG_SPINLOCK
2138 	unsigned int magic, owner_cpu;
2139 	void *owner;
2140@@ -55,13 +64,64 @@
2141 	.owner = SPINLOCK_OWNER_INIT, \
2142 	.owner_cpu = -1 }
2143 #else
2144+#ifdef CONFIG_LOCKMETER
2145+# define SPIN_LOCK_UNLOCKED \
2146+	(spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \
2147+		.index = 0 }
2148+#define RW_LOCK_UNLOCKED \
2149+	(rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \
2150+		.cpu = 0, \
2151+		.index = 0 }
2152+#else
2153 # define SPIN_LOCK_UNLOCKED \
2154 	(spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED }
2155 #define RW_LOCK_UNLOCKED \
2156 	(rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED }
2157 #endif
2158+#endif
2159 
2160 #define DEFINE_SPINLOCK(x)	spinlock_t x = SPIN_LOCK_UNLOCKED
2161 #define DEFINE_RWLOCK(x)	rwlock_t x = RW_LOCK_UNLOCKED
2162 
2163+#ifdef CONFIG_LOCKMETER
2164+extern void _metered_spin_lock(spinlock_t *, void *);
2165+extern int _metered_spin_trylock(spinlock_t *, void *);
2166+extern void _metered_spin_lock_flags(spinlock_t *, unsigned long *, void *);
2167+extern int _metered_write_trylock(rwlock_t *, void *);
2168+extern void _metered_read_lock(rwlock_t *, void *);
2169+extern void _metered_write_lock(rwlock_t *, void *);
2170+extern void _metered_spin_unlock(spinlock_t *);
2171+extern void _metered_read_unlock(rwlock_t *);
2172+extern void _metered_write_unlock(rwlock_t *);
2173+
2174+#define _do_raw_spin_lock(lock) \
2175+	_metered_spin_lock(lock, __builtin_return_address(0))
2176+#define _do_raw_spin_lock_flags(lock, flagsp) \
2177+	_metered_spin_lock_flags(lock, flagsp, __builtin_return_address(0))
2178+#define _do_raw_spin_trylock(lock) \
2179+	_metered_spin_trylock(lock, __builtin_return_address(0))
2180+#define _do_raw_read_trylock(lock)	_raw_read_trylock(lock)
2181+#define _do_raw_write_trylock(lock) \
2182+	_metered_write_trylock(lock, __builtin_return_address(0))
2183+#define _do_raw_read_lock(lock) \
2184+	_metered_read_lock(lock, __builtin_return_address(0))
2185+#define _do_raw_write_lock(lock) \
2186+	_metered_write_lock(lock, __builtin_return_address(0))
2187+#define _do_raw_spin_unlock(lock)	_metered_spin_unlock(lock)
2188+#define _do_raw_read_unlock(lock)	_metered_read_unlock(lock)
2189+#define _do_raw_write_unlock(lock)	_metered_write_unlock(lock)
2190+#else
2191+#define _do_raw_spin_lock(lock)		_raw_spin_lock(lock)
2192+#define _do_raw_spin_lock_flags(lock, flagsp) \
2193+	_raw_spin_lock_flags(lock, flagsp)
2194+#define _do_raw_spin_trylock(lock)	_raw_spin_trylock(lock)
2195+#define _do_raw_read_trylock(lock)	_raw_read_trylock(lock)
2196+#define _do_raw_write_trylock(lock)	_raw_write_trylock(lock)
2197+#define _do_raw_read_lock(lock)		_raw_read_lock(lock)
2198+#define _do_raw_write_lock(lock)	_raw_write_lock(lock)
2199+#define _do_raw_spin_unlock(lock)	_raw_spin_unlock(lock)
2200+#define _do_raw_read_unlock(lock)	_raw_read_unlock(lock)
2201+#define _do_raw_write_unlock(lock)	_raw_write_unlock(lock)
2202+#endif
2203+
2204 #endif /* __LINUX_SPINLOCK_TYPES_H */
2205Index: linux/kernel/spinlock.c
2206===================================================================
2207--- linux.orig/kernel/spinlock.c	2005-11-18 14:58:48.000000000 -0800
2208+++ linux/kernel/spinlock.c	2005-11-18 14:59:31.000000000 -0800
2209@@ -30,7 +30,7 @@
2210 int __lockfunc _spin_trylock(spinlock_t *lock)
2211 {
2212 	preempt_disable();
2213-	if (_raw_spin_trylock(lock))
2214+	if (_do_raw_spin_trylock(lock))
2215 		return 1;
2216 
2217 	preempt_enable();
2218@@ -41,7 +41,7 @@
2219 int __lockfunc _read_trylock(rwlock_t *lock)
2220 {
2221 	preempt_disable();
2222-	if (_raw_read_trylock(lock))
2223+	if (_do_raw_read_trylock(lock))
2224 		return 1;
2225 
2226 	preempt_enable();
2227@@ -52,7 +52,7 @@
2228 int __lockfunc _write_trylock(rwlock_t *lock)
2229 {
2230 	preempt_disable();
2231-	if (_raw_write_trylock(lock))
2232+	if (_do_raw_write_trylock(lock))
2233 		return 1;
2234 
2235 	preempt_enable();
2236@@ -65,7 +65,7 @@
2237 void __lockfunc _read_lock(rwlock_t *lock)
2238 {
2239 	preempt_disable();
2240-	_raw_read_lock(lock);
2241+	_do_raw_read_lock(lock);
2242 }
2243 EXPORT_SYMBOL(_read_lock);
2244 
2245@@ -75,7 +75,7 @@
2246 
2247 	local_irq_save(flags);
2248 	preempt_disable();
2249-	_raw_spin_lock_flags(lock, &flags);
2250+	_do_raw_spin_lock_flags(lock, &flags);
2251 	return flags;
2252 }
2253 EXPORT_SYMBOL(_spin_lock_irqsave);
2254@@ -84,7 +84,7 @@
2255 {
2256 	local_irq_disable();
2257 	preempt_disable();
2258-	_raw_spin_lock(lock);
2259+	_do_raw_spin_lock(lock);
2260 }
2261 EXPORT_SYMBOL(_spin_lock_irq);
2262 
2263@@ -92,7 +92,7 @@
2264 {
2265 	local_bh_disable();
2266 	preempt_disable();
2267-	_raw_spin_lock(lock);
2268+	_do_raw_spin_lock(lock);
2269 }
2270 EXPORT_SYMBOL(_spin_lock_bh);
2271 
2272@@ -102,7 +102,7 @@
2273 
2274 	local_irq_save(flags);
2275 	preempt_disable();
2276-	_raw_read_lock(lock);
2277+	_do_raw_read_lock(lock);
2278 	return flags;
2279 }
2280 EXPORT_SYMBOL(_read_lock_irqsave);
2281@@ -111,7 +111,7 @@
2282 {
2283 	local_irq_disable();
2284 	preempt_disable();
2285-	_raw_read_lock(lock);
2286+	_do_raw_read_lock(lock);
2287 }
2288 EXPORT_SYMBOL(_read_lock_irq);
2289 
2290@@ -119,7 +119,7 @@
2291 {
2292 	local_bh_disable();
2293 	preempt_disable();
2294-	_raw_read_lock(lock);
2295+	_do_raw_read_lock(lock);
2296 }
2297 EXPORT_SYMBOL(_read_lock_bh);
2298 
2299@@ -129,7 +129,7 @@
2300 
2301 	local_irq_save(flags);
2302 	preempt_disable();
2303-	_raw_write_lock(lock);
2304+	_do_raw_write_lock(lock);
2305 	return flags;
2306 }
2307 EXPORT_SYMBOL(_write_lock_irqsave);
2308@@ -138,7 +138,7 @@
2309 {
2310 	local_irq_disable();
2311 	preempt_disable();
2312-	_raw_write_lock(lock);
2313+	_do_raw_write_lock(lock);
2314 }
2315 EXPORT_SYMBOL(_write_lock_irq);
2316 
2317@@ -146,14 +146,14 @@
2318 {
2319 	local_bh_disable();
2320 	preempt_disable();
2321-	_raw_write_lock(lock);
2322+	_do_raw_write_lock(lock);
2323 }
2324 EXPORT_SYMBOL(_write_lock_bh);
2325 
2326 void __lockfunc _spin_lock(spinlock_t *lock)
2327 {
2328 	preempt_disable();
2329-	_raw_spin_lock(lock);
2330+	_do_raw_spin_lock(lock);
2331 }
2332 
2333 EXPORT_SYMBOL(_spin_lock);
2334@@ -161,7 +161,7 @@
2335 void __lockfunc _write_lock(rwlock_t *lock)
2336 {
2337 	preempt_disable();
2338-	_raw_write_lock(lock);
2339+	_do_raw_write_lock(lock);
2340 }
2341 
2342 EXPORT_SYMBOL(_write_lock);
2343@@ -259,28 +259,28 @@
2344 
2345 void __lockfunc _spin_unlock(spinlock_t *lock)
2346 {
2347-	_raw_spin_unlock(lock);
2348+	_do_raw_spin_unlock(lock);
2349 	preempt_enable();
2350 }
2351 EXPORT_SYMBOL(_spin_unlock);
2352 
2353 void __lockfunc _write_unlock(rwlock_t *lock)
2354 {
2355-	_raw_write_unlock(lock);
2356+	_do_raw_write_unlock(lock);
2357 	preempt_enable();
2358 }
2359 EXPORT_SYMBOL(_write_unlock);
2360 
2361 void __lockfunc _read_unlock(rwlock_t *lock)
2362 {
2363-	_raw_read_unlock(lock);
2364+	_do_raw_read_unlock(lock);
2365 	preempt_enable();
2366 }
2367 EXPORT_SYMBOL(_read_unlock);
2368 
2369 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
2370 {
2371-	_raw_spin_unlock(lock);
2372+	_do_raw_spin_unlock(lock);
2373 	local_irq_restore(flags);
2374 	preempt_enable();
2375 }
2376@@ -288,7 +288,7 @@
2377 
2378 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
2379 {
2380-	_raw_spin_unlock(lock);
2381+	_do_raw_spin_unlock(lock);
2382 	local_irq_enable();
2383 	preempt_enable();
2384 }
2385@@ -296,7 +296,7 @@
2386 
2387 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
2388 {
2389-	_raw_spin_unlock(lock);
2390+	_do_raw_spin_unlock(lock);
2391 	preempt_enable_no_resched();
2392 	local_bh_enable();
2393 }
2394@@ -304,7 +304,7 @@
2395 
2396 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
2397 {
2398-	_raw_read_unlock(lock);
2399+	_do_raw_read_unlock(lock);
2400 	local_irq_restore(flags);
2401 	preempt_enable();
2402 }
2403@@ -312,7 +312,7 @@
2404 
2405 void __lockfunc _read_unlock_irq(rwlock_t *lock)
2406 {
2407-	_raw_read_unlock(lock);
2408+	_do_raw_read_unlock(lock);
2409 	local_irq_enable();
2410 	preempt_enable();
2411 }
2412@@ -320,7 +320,7 @@
2413 
2414 void __lockfunc _read_unlock_bh(rwlock_t *lock)
2415 {
2416-	_raw_read_unlock(lock);
2417+	_do_raw_read_unlock(lock);
2418 	preempt_enable_no_resched();
2419 	local_bh_enable();
2420 }
2421@@ -328,7 +328,7 @@
2422 
2423 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
2424 {
2425-	_raw_write_unlock(lock);
2426+	_do_raw_write_unlock(lock);
2427 	local_irq_restore(flags);
2428 	preempt_enable();
2429 }
2430@@ -336,7 +336,7 @@
2431 
2432 void __lockfunc _write_unlock_irq(rwlock_t *lock)
2433 {
2434-	_raw_write_unlock(lock);
2435+	_do_raw_write_unlock(lock);
2436 	local_irq_enable();
2437 	preempt_enable();
2438 }
2439@@ -344,7 +344,7 @@
2440 
2441 void __lockfunc _write_unlock_bh(rwlock_t *lock)
2442 {
2443-	_raw_write_unlock(lock);
2444+	_do_raw_write_unlock(lock);
2445 	preempt_enable_no_resched();
2446 	local_bh_enable();
2447 }
2448@@ -354,7 +354,7 @@
2449 {
2450 	local_bh_disable();
2451 	preempt_disable();
2452-	if (_raw_spin_trylock(lock))
2453+	if (_do_raw_spin_trylock(lock))
2454 		return 1;
2455 
2456 	preempt_enable_no_resched();
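
The spinlock_types.h and spinlock.c changes above reroute each _do_raw_* operation, under CONFIG_LOCKMETER, to a _metered_* variant that also receives the caller's PC. As a rough, self-contained illustration of that indirection (not part of the patch; every name below is invented for the example, and a pthread mutex stands in for a kernel spinlock), the same pattern looks like this in user space:

/*
 * Illustrative sketch only -- mimics the _do_raw_* indirection above.
 * With METER defined, the out-of-line wrapper routes the lock call
 * through a metered variant that also records the call site.
 */
#include <pthread.h>
#include <stdio.h>

#define METER 1

static void metered_lock(pthread_mutex_t *m, void *caller_pc)
{
	/* a real implementation would timestamp and attribute to caller_pc */
	printf("lock %p taken from %p\n", (void *)m, caller_pc);
	pthread_mutex_lock(m);
}

/* out-of-line wrapper, analogous to the wrappers in kernel/spinlock.c */
__attribute__((noinline))
static void take_lock(pthread_mutex_t *m)
{
#if METER
	/* evaluated here, this yields the address of take_lock()'s caller */
	metered_lock(m, __builtin_return_address(0));
#else
	pthread_mutex_lock(m);
#endif
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

	take_lock(&m);		/* the recorded PC points back to this call site */
	pthread_mutex_unlock(&m);
	return 0;
}

The reason the wrapper is out of line is the same as in kernel/spinlock.c: __builtin_return_address(0) evaluated inside the wrapper identifies the original lock call site, which is what the metering code uses to attribute hold and wait statistics.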
2457Index: linux/arch/ppc64/Kconfig.debug
2458===================================================================
2459--- linux.orig/arch/ppc64/Kconfig.debug	2005-11-18 14:58:48.000000000 -0800
2460+++ linux/arch/ppc64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
2461@@ -19,6 +19,13 @@
2462 	 for kernel debugging, non-intrusive instrumentation and testing.
2463 	 If in doubt, say "N".
2464 
2465+config LOCKMETER
2466+	bool "Kernel lock metering"
2467+	depends on SMP
2468+	help
2469+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
2470+	  but allows you to see various statistics using the lockstat command.
2471+
2472 config DEBUG_STACK_USAGE
2473 	bool "Stack utilization instrumentation"
2474 	depends on DEBUG_KERNEL
2475Index: linux/include/asm-ppc64/lockmeter.h
2476===================================================================
2477--- /dev/null	1970-01-01 00:00:00.000000000 +0000
2478+++ linux/include/asm-ppc64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
2479@@ -0,0 +1,71 @@
2480+/*
2481+ * Copyright (C) 1999,2000 Silicon Graphics, Inc.
2482+ *
2483+ * Written by John Hawkes (hawkes@sgi.com)
2484+ * Based on klstat.h by Jack Steiner (steiner@sgi.com)
2485+ *
2486+ * Modified by Ray Bryant (raybry@us.ibm.com)
2487+ * Changes Copyright (C) 2000 IBM, Inc.
2488+ * Added save of index in spinlock_t to improve efficiency
2489+ * of "hold" time reporting for spinlocks.
2490+ * Added support for hold time statistics for read and write
2491+ * locks.
2492+ * Moved machine dependent code here from include/lockmeter.h.
2493+ *
2494+ * Modified by Tony Garcia (garcia1@us.ibm.com)
2495+ * Ported to PowerPC 64.
2496+ */
2497+
2498+#ifndef _PPC64_LOCKMETER_H
2499+#define _PPC64_LOCKMETER_H
2500+
2501+
2502+#include <asm/spinlock.h>
2503+#include <linux/version.h>
2504+#include <linux/cpufreq.h>
2505+
2506+#include <asm/processor.h>	/* definitions for SPRN_TBRL,
2507+				   SPRN_TBRU, mftb() */
2508+extern unsigned long ppc_proc_freq;
2509+
2510+#define CPU_CYCLE_FREQUENCY	ppc_proc_freq
2511+
2512+#define THIS_CPU_NUMBER		smp_processor_id()
2513+
2514+/*
2515+ * return the number of readers for a rwlock_t
2516+ */
2517+#define RWLOCK_READERS(rwlock_ptr)	rwlock_readers(rwlock_ptr)
2518+
2519+/* Return number of readers */
2520+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
2521+{
2522+	signed int tmp = rwlock_ptr->lock;
2523+
2524+	if (tmp > 0)
2525+		return tmp;
2526+	else
2527+		return 0;
2528+}
2529+
2530+/*
2531+ * return true if rwlock is write locked
2532+ * (note that other lock attempts can cause the lock value to be negative)
2533+ */
2534+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)	((signed int)(rwlock_ptr)->lock < 0)
2535+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)	((signed int)(rwlock_ptr)->lock > 0)
2536+
2537+/* Written by Carl L. to read the time base counters on ppc64;
2538+   replaces the Intel-only rdtsc call. */
2539+static inline long get_cycles64(void)
2540+{
2541+	unsigned long tb;
2542+
2543+	/* read the upper and lower 32 bits of the time base counter */
2544+	tb = mfspr(SPRN_TBRU);
2545+	tb = (tb << 32) | mfspr(SPRN_TBRL);
2546+
2547+	return tb;
2548+}
2549+
2550+#endif /* _PPC64_LOCKMETER_H */
2551
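
For completeness, here is a hedged sketch of how the /proc/lockmeter interface implemented above by put_lockmeter_info() and get_lockmeter_info() might be driven from user space. It is not part of the patch and is normally the job of the lockstat tool; LM_ON and LM_OFF below are placeholders, since the actual single-byte LSTAT_* command values come from the lockmeter headers, which are not shown in this excerpt.

/*
 * Illustrative sketch only -- roughly what a tool like lockstat does.
 * LM_ON/LM_OFF are placeholders for the real LSTAT_ON/LSTAT_OFF values.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define LM_ON	1	/* placeholder for LSTAT_ON  */
#define LM_OFF	2	/* placeholder for LSTAT_OFF */

int main(void)
{
	char cmd;
	char *buf;
	ssize_t n;
	int fd = open("/proc/lockmeter", O_RDWR);

	if (fd < 0) {
		perror("open /proc/lockmeter");
		return 1;
	}

	cmd = LM_ON;			/* start metering */
	write(fd, &cmd, 1);
	sleep(10);			/* let statistics accumulate */
	cmd = LM_OFF;			/* stop metering */
	write(fd, &cmd, 1);

	/* one-shot read: header, per-cpu counts, directory, read-lock counts */
	buf = malloc(1 << 20);
	if (!buf) {
		close(fd);
		return 1;
	}
	n = read(fd, buf, 1 << 20);
	printf("read %zd bytes of lockmeter data\n", n);

	free(buf);
	close(fd);
	return 0;
}

A real reader would size its buffer from the layout returned by get_lockmeter_info_size() and then parse the lstat_user_request_t header, the per-cpu lstat_cpu_counts_t arrays, the directory entries, and the per-cpu lstat_read_lock_cpu_counts_t arrays, in the order get_lockmeter_info() copies them out.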