Index: linux/arch/i386/Kconfig.debug
===================================================================
--- linux.orig/arch/i386/Kconfig.debug	2005-11-18 14:59:18.000000000 -0800
+++ linux/arch/i386/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
@@ -62,6 +62,13 @@
 	  on the VM subsystem for higher order allocations. This option
 	  will also use IRQ stacks to compensate for the reduced stackspace.

+config LOCKMETER
+	bool "Kernel lock metering"
+	depends on SMP
+	help
+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
+	  but allows you to see various statistics using the lockstat command.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
Index: linux/arch/ia64/Kconfig.debug
===================================================================
--- linux.orig/arch/ia64/Kconfig.debug	2005-11-18 14:59:18.000000000 -0800
+++ linux/arch/ia64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
@@ -72,4 +72,11 @@
 	depends on COMPAT && SYSVIPC
 	default y

+config LOCKMETER
+       bool "Kernel lock metering"
+       depends on SMP
+       help
+         Say Y to enable kernel lock metering, which adds overhead to SMP locks,
+         but allows you to see various statistics using the lockstat command.
+
 endmenu
Index: linux/fs/proc/proc_misc.c
===================================================================
--- linux.orig/fs/proc/proc_misc.c	2005-11-18 14:59:20.000000000 -0800
+++ linux/fs/proc/proc_misc.c	2005-11-18 14:59:31.000000000 -0800
@@ -563,6 +563,36 @@
 		entry->proc_fops = f;
 }

+#ifdef CONFIG_LOCKMETER
+extern ssize_t get_lockmeter_info(char *, size_t, loff_t *);
+extern ssize_t put_lockmeter_info(const char *, size_t);
+extern int get_lockmeter_info_size(void);
+
+/*
+ * This function accesses lock metering information.
+ */
+static ssize_t read_lockmeter(struct file *file, char *buf,
+			      size_t count, loff_t *ppos)
+{
+	return get_lockmeter_info(buf, count, ppos);
+}
+
+/*
+ * Writing to /proc/lockmeter controls metering: on/off/reset/release
+ */
+static ssize_t write_lockmeter(struct file * file, const char * buf,
+			       size_t count, loff_t *ppos)
+{
+	return put_lockmeter_info(buf, count);
+}
+
+static struct file_operations proc_lockmeter_operations = {
+	NULL,           /* lseek */
+	read:		read_lockmeter,
+	write:		write_lockmeter,
+};
+#endif  /* CONFIG_LOCKMETER */
+
 void __init proc_misc_init(void)
 {
 	struct proc_dir_entry *entry;
@@ -629,6 +659,13 @@
 	if (entry)
		entry->proc_fops = &proc_sysrq_trigger_operations;
 #endif
+#ifdef CONFIG_LOCKMETER
+	entry = create_proc_entry("lockmeter", S_IWUSR | S_IRUGO, NULL);
+	if (entry) {
+		entry->proc_fops = &proc_lockmeter_operations;
+		entry->size = get_lockmeter_info_size();
+	}
+#endif
 #ifdef CONFIG_PPC32
 	{
 		extern struct file_operations ppc_htab_operations;
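
[The /proc/lockmeter node created above is a binary interface: a read returns an lstat_user_request_t header followed by per-cpu count arrays and the lock directory (see get_lockmeter_info() in kernel/lockmeter.c later in this patch), and a write of a single control byte (LSTAT_ON, LSTAT_OFF, LSTAT_RESET, LSTAT_RELEASE) drives put_lockmeter_info(). A minimal sketch, not the real lockstat tool, of a userspace reader pulling just the header; it assumes the struct from <linux/lockmeter.h> is visible to userspace:]

/* sketch only: assumes <linux/lockmeter.h> compiles in userspace */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/lockmeter.h>	/* lstat_user_request_t */

int main(void)
{
	lstat_user_request_t req;
	int fd = open("/proc/lockmeter", O_RDONLY);

	if (fd < 0 || read(fd, &req, sizeof(req)) != sizeof(req))
		return 1;
	/* the header carries version, state, cpu count, and cycles/sec */
	printf("lstat v%d state=%d cpus=%d cycleval=%u\n",
	       req.lstat_version, req.state, req.maxcpus, req.cycleval);
	close(fd);
	return 0;
}
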
Index: linux/include/asm-alpha/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-alpha/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,42 @@
+/*
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *
+ *  Modified by Peter Rival (frival@zk3.dec.com)
+ */
+
+#ifndef _ALPHA_LOCKMETER_H
+#define _ALPHA_LOCKMETER_H
+
+#include <asm/hwrpb.h>
+#define CPU_CYCLE_FREQUENCY	hwrpb->cycle_freq
+
+#define get_cycles64()		get_cycles()
+
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+#include <linux/version.h>
+
+#define SPINLOCK_MAGIC_INIT /**/
+
+/*
+ * return true if rwlock is write locked
+ * (note that other lock attempts can cause the lock value to be negative)
+ */
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) (((inst_rwlock_t *)rwlock_ptr)->lock & 1)
+#define IABS(x) ((x) > 0 ? (x) : -(x))
+
+#define RWLOCK_READERS(rwlock_ptr)	rwlock_readers(rwlock_ptr)
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+	int tmp = (int) ((inst_rwlock_t *)rwlock_ptr)->raw_lock.lock;
+	/* readers subtract 2, so we have to:		*/
+	/* 	- andnot off a possible writer (bit 0)	*/
+	/*	- get the absolute value		*/
+	/*	- divide by 2 (right shift by one)	*/
+	/* to find the number of readers		*/
+	if (tmp == 0) return(0);
+	else return(IABS(tmp & ~1)>>1);
+}
+
+#endif /* _ALPHA_LOCKMETER_H */
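
[The alpha decoding above can be checked with concrete numbers: each reader subtracts 2 from the lock word and a waiting writer sets bit 0. A worked example with hypothetical values:]

/* worked example of rwlock_readers() on alpha (hypothetical values):
 * three readers give a lock word of -6; if a writer has also set
 * bit 0 it becomes -5.  Then:
 *   -5 & ~1  == -6    (mask off the writer bit)
 *   IABS(-6) ==  6    (absolute value)
 *   6 >> 1   ==  3    readers
 */
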
Index: linux/include/asm-i386/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-i386/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
+ *
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *
+ *  Modified by Ray Bryant (raybry@us.ibm.com)
+ *  Changes Copyright (C) 2000 IBM, Inc.
+ *  Added save of index in spinlock_t to improve efficiency
+ *  of "hold" time reporting for spinlocks.
+ *  Added support for hold time statistics for read and write
+ *  locks.
+ *  Moved machine dependent code here from include/lockmeter.h.
+ *
+ */
+
+#ifndef _I386_LOCKMETER_H
+#define _I386_LOCKMETER_H
+
+#include <asm/spinlock.h>
+#include <asm/rwlock.h>
+
+#include <linux/version.h>
+
+#ifdef __KERNEL__
+extern unsigned int cpu_khz;
+#define CPU_CYCLE_FREQUENCY	(cpu_khz * 1000)
+#else
+#define CPU_CYCLE_FREQUENCY	450000000
+#endif
+
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+/*
+ * return the number of readers for a rwlock_t
+ */
+#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+	int tmp = (int) rwlock_ptr->raw_lock.lock;
+	/* read and write lock attempts may cause the lock value to temporarily */
+	/* be negative.  Until it is >= 0 we know nothing (i. e. can't tell if */
+	/* it is -1 because it was write locked and somebody tried to read     */
+	/* lock it, or if it is -1 because it was read locked and somebody     */
+	/* tried to write lock it). ........................................... */
+	do {
+		tmp = (int) rwlock_ptr->raw_lock.lock;
+	} while (tmp < 0);
+	if (tmp == 0) return(0);
+	else return(RW_LOCK_BIAS-tmp);
+}
+
+/*
+ * return true if rwlock is write locked
+ * (note that other lock attempts can cause the lock value to be negative)
+ */
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.lock <= 0)
+#define IABS(x) ((x) > 0 ? (x) : -(x))
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((IABS((rwlock_ptr)->raw_lock.lock) % RW_LOCK_BIAS) != 0)
+
+/* this is a lot of typing just to get gcc to emit "rdtsc" */
+static inline long long get_cycles64 (void)
+{
+	union longlong_u {
+		long long intlong;
+		struct intint_s {
+			uint32_t eax;
+			uint32_t edx;
+		} intint;
+	} longlong;
+
+	rdtsc(longlong.intint.eax,longlong.intint.edx);
+	return longlong.intlong;
+}
+
+#endif /* _I386_LOCKMETER_H */
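
[Both rwlock_readers() and the *_IS_*_LOCKED() tests above rely on the i386 rwlock convention that an unlocked word holds RW_LOCK_BIAS (conventionally 0x01000000 in kernels of this era), each reader subtracts 1, and a writer subtracts the whole bias. A few hypothetical lock-word values make the arithmetic concrete:]

/* hypothetical i386 lock words, assuming RW_LOCK_BIAS == 0x01000000:  */
/* unlocked:     lock == 0x01000000, RW_LOCK_BIAS - lock == 0 readers  */
/* two readers:  lock == 0x00fffffe, RW_LOCK_BIAS - lock == 2 readers  */
/* write locked: lock == 0x00000000, RWLOCK_IS_WRITE_LOCKED() is true  */
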
Index: linux/include/asm-ia64/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-ia64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
+ *
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ */
+
+#ifndef _IA64_LOCKMETER_H
+#define _IA64_LOCKMETER_H
+
+#ifdef local_cpu_data
+#define CPU_CYCLE_FREQUENCY	local_cpu_data->itc_freq
+#else
+#define CPU_CYCLE_FREQUENCY	my_cpu_data.itc_freq
+#endif
+#define get_cycles64()		get_cycles()
+
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+/*
+ * return the number of readers for a rwlock_t
+ */
+#define RWLOCK_READERS(rwlock_ptr)      ((rwlock_ptr)->raw_lock.read_counter)
+
+/*
+ * return true if rwlock is write locked
+ * (note that other lock attempts can cause the lock value to be negative)
+ */
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.write_lock)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((rwlock_ptr)->raw_lock.read_counter)
+
+#endif /* _IA64_LOCKMETER_H */
+
Index: linux/include/asm-mips/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-mips/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
+ *
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *  Ported to mips32 for Asita Technologies
+ *   by D.J. Barrow ( dj.barrow@asitatechnologies.com )
+ */
+#ifndef _ASM_LOCKMETER_H
+#define _ASM_LOCKMETER_H
+
+/* do_gettimeoffset is a function pointer on mips */
+/* & it is not included by <linux/time.h> */
+#include <asm/time.h>
+#include <linux/time.h>
+#include <asm/div64.h>
+
+#define SPINLOCK_MAGIC_INIT	/* */
+
+#define CPU_CYCLE_FREQUENCY	get_cpu_cycle_frequency()
+
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+static uint32_t cpu_cycle_frequency = 0;
+
+static uint32_t get_cpu_cycle_frequency(void)
+{
+    /* a total hack, slow and invasive, but ... it works */
+    int sec;
+    uint32_t start_cycles;
+    struct timeval tv;
+
+    if (cpu_cycle_frequency == 0) {	/* uninitialized */
+	do_gettimeofday(&tv);
+	sec = tv.tv_sec;	/* set up to catch the tv_sec rollover */
+	while (sec == tv.tv_sec) { do_gettimeofday(&tv); }
+	sec = tv.tv_sec;	/* rolled over to a new sec value */
+	start_cycles = get_cycles();
+	while (sec == tv.tv_sec) { do_gettimeofday(&tv); }
+	cpu_cycle_frequency = get_cycles() - start_cycles;
+    }
+
+    return cpu_cycle_frequency;
+}
+
+extern struct timeval xtime;
+
+static uint64_t get_cycles64(void)
+{
+    static uint64_t last_get_cycles64 = 0;
+    uint64_t ret;
+    unsigned long sec;
+    unsigned long usec, usec_offset;
+
+again:
+    sec  = xtime.tv_sec;
+    usec = xtime.tv_usec;
+    usec_offset = do_gettimeoffset();
+    if ((xtime.tv_sec != sec)  ||
+	(xtime.tv_usec != usec)||
+	(usec_offset >= 20000))
+	goto again;
+
+    ret = ((uint64_t)(usec + usec_offset) * cpu_cycle_frequency);
+    /* We can't do a normal 64 bit division on mips without libgcc.a */
+    do_div(ret,1000000);
+    ret +=  ((uint64_t)sec * cpu_cycle_frequency);
+
+    /* XXX why does time go backwards?  do_gettimeoffset?  general time adj? */
+    if (ret <= last_get_cycles64)
+	ret  = last_get_cycles64+1;
+    last_get_cycles64 = ret;
+
+    return ret;
+}
+
+/*
+ * return the number of readers for a rwlock_t
+ */
+#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+	int tmp = (int) rwlock_ptr->raw_lock.lock;
+	return (tmp >= 0) ? tmp : 0;
+}
+
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.lock < 0)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((rwlock_ptr)->raw_lock.lock > 0)
+
+#endif /* _ASM_LOCKMETER_H */
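
[The calibration hack in get_cpu_cycle_frequency() above generalizes: align to a tv_sec rollover, then count cycle-counter ticks across exactly one second. A standalone sketch of the same idea in userspace form (not part of the patch; the calibrate() name and the cycles callback are hypothetical):]

#include <sys/time.h>

/* returns cycle-counter ticks per second, given a raw counter reader */
static unsigned long long calibrate(unsigned long long (*cycles)(void))
{
	struct timeval tv;
	unsigned long long start;
	long sec;

	gettimeofday(&tv, 0);
	sec = tv.tv_sec;
	while (sec == tv.tv_sec)	/* spin to the seconds rollover */
		gettimeofday(&tv, 0);
	sec = tv.tv_sec;
	start = cycles();
	while (sec == tv.tv_sec)	/* burn exactly one second */
		gettimeofday(&tv, 0);
	return cycles() - start;
}
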
Index: linux/include/asm-sparc64/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-sparc64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com)
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#ifndef _SPARC64_LOCKMETER_H
+#define _SPARC64_LOCKMETER_H
+
+#include <linux/smp.h>
+#include <asm/spinlock.h>
+#include <asm/timer.h>
+#include <asm/timex.h>
+
+/* Actually, this is not the CPU frequency but the system tick
+ * frequency, which is good enough for lock metering.
+ */
+#define CPU_CYCLE_FREQUENCY	(timer_tick_offset * HZ)
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+#define RWLOCK_READERS(rwlock_ptr)	rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+	signed int tmp = rwlock_ptr->raw_lock.lock;
+
+	if (tmp > 0)
+		return tmp;
+	else
+		return 0;
+}
+
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr)	((signed int)((rwlock_ptr)->raw_lock.lock) < 0)
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)	((signed int)((rwlock_ptr)->raw_lock.lock) > 0)
+
+#define get_cycles64()	get_cycles()
+
+#endif /* _SPARC64_LOCKMETER_H */
Index: linux/include/asm-x86_64/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/asm-x86_64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,64 @@
+/*
+ *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
+ *
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *
+ *  Modified by Ray Bryant (raybry@us.ibm.com)
+ *  Changes Copyright (C) 2000 IBM, Inc.
+ *  Added save of index in spinlock_t to improve efficiency
+ *  of "hold" time reporting for spinlocks.
+ *  Added support for hold time statistics for read and write
+ *  locks.
+ *  Moved machine dependent code here from include/lockmeter.h.
+ *
+ */
+
+#ifndef _X8664_LOCKMETER_H
+#define _X8664_LOCKMETER_H
+
+#include <asm/spinlock.h>
+#include <asm/rwlock.h>
+
+#include <linux/version.h>
+
+#ifdef __KERNEL__
+extern unsigned int cpu_khz;
+#define CPU_CYCLE_FREQUENCY	(cpu_khz * 1000)
+#else
+#define CPU_CYCLE_FREQUENCY	450000000
+#endif
+
+#define THIS_CPU_NUMBER		smp_processor_id()
+
+/*
+ * return the number of readers for a rwlock_t
+ */
+#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
+
+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
+{
+	int tmp = (int) rwlock_ptr->raw_lock.lock;
+	/* read and write lock attempts may cause the lock value to temporarily */
+	/* be negative.  Until it is >= 0 we know nothing (i. e. can't tell if */
+	/* it is -1 because it was write locked and somebody tried to read     */
+	/* lock it, or if it is -1 because it was read locked and somebody     */
+	/* tried to write lock it). ........................................... */
+	do {
+		tmp = (int) rwlock_ptr->raw_lock.lock;
+	} while (tmp < 0);
+	if (tmp == 0) return(0);
+	else return(RW_LOCK_BIAS-tmp);
+}
+
+/*
+ * return true if rwlock is write locked
+ * (note that other lock attempts can cause the lock value to be negative)
+ */
+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((rwlock_ptr)->raw_lock.lock <= 0)
+#define IABS(x) ((x) > 0 ? (x) : -(x))
+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((IABS((rwlock_ptr)->raw_lock.lock) % RW_LOCK_BIAS) != 0)
+
+#define get_cycles64()	get_cycles()
+
+#endif /* _X8664_LOCKMETER_H */
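
[Unlike the i386 header, x86_64 can define get_cycles64() directly as get_cycles() (as alpha, ia64, and sparc64 do above), since its cycle counter is already read as a 64-bit value. On x86-64 the rdtsc instruction still delivers the TSC in edx:eax; a sketch of the equivalent direct read (the rdtsc64 name is hypothetical):]

/* sketch of a 64-bit TSC read on x86-64 (rdtsc returns edx:eax) */
static inline unsigned long long rdtsc64(void)
{
	unsigned int lo, hi;
	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((unsigned long long)hi << 32) | lo;
}
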
Index: linux/include/linux/lockmeter.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/include/linux/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,342 @@
+/*
+ *  Copyright (C) 1999-2002 Silicon Graphics, Inc.
+ *
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
+ *
+ *  Modified by Ray Bryant (raybry@us.ibm.com) Feb-Apr 2000
+ *  Changes Copyright (C) 2000 IBM, Inc.
+ *  Added save of index in spinlock_t to improve efficiency
+ *  of "hold" time reporting for spinlocks
+ *  Added support for hold time statistics for read and write
+ *  locks.
+ *  Moved machine dependent code to include/asm/lockmeter.h.
+ *
+ */
+
+#ifndef _LINUX_LOCKMETER_H
+#define _LINUX_LOCKMETER_H
+
+#include <linux/utsname.h>
+
+#ifdef CONFIG_LOCKMETER
+
+/*---------------------------------------------------
+ *	architecture-independent lockmeter.h
+ *-------------------------------------------------*/
+
+/*
+ * raybry -- version 2: added efficient hold time statistics
+ *           requires lstat recompile, so flagged as new version
+ * raybry -- version 3: added global reader lock data
+ * hawkes -- version 4: removed some unnecessary fields to simplify mips64 port
+ */
+#define LSTAT_VERSION	5
+
+int	lstat_update(void*, void*, int);
+int	lstat_update_time(void*, void*, int, uint32_t);
+
+/*
+ * Currently, the mips64 and sparc64 kernels talk to a 32-bit lockstat, so we
+ * need to force compatibility in the inter-communication data structure.
+ */
+
+#if defined(CONFIG_MIPS32_COMPAT)
+#define TIME_T		uint32_t
+#elif defined(CONFIG_SPARC) || defined(CONFIG_SPARC64)
+#define TIME_T		uint64_t
+#else
+#define TIME_T		time_t
+#endif
+
+#if defined(__KERNEL__) || (!defined(CONFIG_MIPS32_COMPAT) && !defined(CONFIG_SPARC) && !defined(CONFIG_SPARC64)) || (_MIPS_SZLONG==32)
+#define POINTER		void *
+#else
+#define	POINTER		int64_t
+#endif
+
+/*
+ * Values for the "action" parameter passed to lstat_update.
+ *	ZZZ - do we want a try-success status here???
+ */
+#define LSTAT_ACT_NO_WAIT	0
+#define LSTAT_ACT_SPIN		1
+#define LSTAT_ACT_REJECT	2
+#define LSTAT_ACT_WW_SPIN       3
+#define LSTAT_ACT_SLEPT		4 /* UNUSED */
+
+#define LSTAT_ACT_MAX_VALUES	4 /* NOTE: Increase to 5 if use ACT_SLEPT */
+
+/*
+ * Special values for the low 2 bits of an RA passed to
+ * lstat_update.
+ */
+/* we use these values to figure out what kind of lock data */
+/* is stored in the statistics table entry at index ....... */
+#define LSTAT_RA_SPIN           0  /* spin lock data */
+#define LSTAT_RA_READ           1  /* read lock statistics */
+#define LSTAT_RA_SEMA		2  /* RESERVED */
+#define LSTAT_RA_WRITE          3  /* write lock statistics*/
+
+#define LSTAT_RA(n)	\
+	((void*)( ((unsigned long) caller_pc & ~3) | n) )
+
+/*
+ * Constants used for lock addresses in the lstat_directory
+ * to indicate special values of the lock address.
+ */
+#define	LSTAT_MULTI_LOCK_ADDRESS	NULL
+
+/*
+ * Maximum size of the lockstats tables. Increase this value
+ * if it's not big enough. (Nothing bad happens if it's not
+ * big enough although some locks will not be monitored.)
+ * We record overflows of this quantity in lstat_control.dir_overflows
+ *
+ * Note:  The max value here must fit into the field set
+ * and obtained by the macros PUT_INDEX() and GET_INDEX().
+ * This value depends on how many bits are available in the
+ * lock word in the particular machine implementation we are on.
+ */
+#define LSTAT_MAX_STAT_INDEX		2000
+
+/*
+ * Size and mask for the hash table into the directory.
+ */
+#define LSTAT_HASH_TABLE_SIZE		4096		/* must be 2**N */
+#define LSTAT_HASH_TABLE_MASK		(LSTAT_HASH_TABLE_SIZE-1)
+
+#define DIRHASH(ra)      ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK)
+
+/*
+ *	This defines an entry in the lockstat directory. It contains
+ *	information about a lock being monitored.
+ *	A directory entry only contains the lock identification -
+ *	counts on usage of the lock are kept elsewhere in a per-cpu
+ *	data structure to minimize cache line pinging.
+ */
+typedef struct {
+	POINTER	caller_ra;		  /* RA of code that set lock */
+	POINTER	lock_ptr;		  /* lock address */
+	ushort	next_stat_index;  /* Used to link multiple locks that have the same hash table value */
+} lstat_directory_entry_t;
+
+/*
+ *	A multi-dimensioned array used to contain counts for lock accesses.
+ *	The array is 3-dimensional:
+ *		- CPU number. Keep from thrashing cache lines between CPUs
+ *		- Directory entry index. Identifies the lock
+ *		- Action. Indicates what kind of contention occurred on an
+ *		  access to the lock.
+ *
+ *	The index of an entry in the directory is the same as the 2nd index
+ *	of the entry in the counts array.
+ */
+/*
+ *  This table contains data for spin_locks, write locks, and read locks
+ *  Not all data is used for all cases.  In particular, the hold time
+ *  information is not stored here for read locks since that is a global
+ *  (e. g. cannot be separated out by return address) quantity.
+ *  See the lstat_read_lock_counts_t structure for the global read lock
+ *  hold time.
+ */
+typedef struct {
+	uint64_t    cum_wait_ticks;	/* sum of wait times               */
+	                                /* for write locks, sum of time a  */
+					/* writer is waiting for a reader  */
+	int64_t	    cum_hold_ticks;	/* cumulative sum of holds         */
+	                                /* not used for read mode locks    */
+					/* must be signed. ............... */
+	uint32_t    max_wait_ticks;	/* max waiting time                */
+	uint32_t    max_hold_ticks;	/* max holding time                */
+	uint64_t    cum_wait_ww_ticks;  /* sum times writer waits on writer*/
+	uint32_t    max_wait_ww_ticks;  /* max wait time writer vs writer  */
+	                                /* prev 2 only used for write locks*/
+	uint32_t    acquire_time;       /* time lock acquired this CPU     */
+	uint32_t    count[LSTAT_ACT_MAX_VALUES];
+} lstat_lock_counts_t;
+
+typedef lstat_lock_counts_t	lstat_cpu_counts_t[LSTAT_MAX_STAT_INDEX];
+
+/*
+ * User request to:
+ *	- turn statistic collection on/off, or to reset
+ */
+#define LSTAT_OFF	 0
+#define LSTAT_ON	 1
+#define LSTAT_RESET      2
+#define LSTAT_RELEASE    3
+
+#define LSTAT_MAX_READ_LOCK_INDEX 1000
+typedef struct {
+	POINTER	    lock_ptr;            /* address of lock for output stats */
+	uint32_t    read_lock_count;
+	int64_t     cum_hold_ticks;       /* sum of read lock hold times over */
+	                                  /* all callers. ....................*/
+	uint32_t    write_index;          /* last write lock hash table index */
+	uint32_t    busy_periods;         /* count of busy periods ended this */
+	uint64_t    start_busy;           /* time this busy period started. ..*/
+	uint64_t    busy_ticks;           /* sum of busy periods this lock. ..*/
+	uint64_t    max_busy;             /* longest busy period for this lock*/
+	uint32_t    max_readers;          /* maximum number of readers ...... */
+#ifdef USER_MODE_TESTING
+	rwlock_t    entry_lock;           /* lock for this read lock entry... */
+	                                  /* avoid having more than one rdr at*/
+	                                  /* needed for user space testing... */
+	                                  /* not needed for kernel 'cause it  */
+					  /* is non-preemptive. ............. */
+#endif
+} lstat_read_lock_counts_t;
+typedef lstat_read_lock_counts_t	lstat_read_lock_cpu_counts_t[LSTAT_MAX_READ_LOCK_INDEX];
+
+#if defined(__KERNEL__) || defined(USER_MODE_TESTING)
+
+/*
+ * macros to cache and retrieve an index value inside of a lock
+ * these macros assume that there are less than 65536 simultaneous
+ * (read mode) holders of a rwlock.
+ * we also assume that the hash table has less than 32767 entries.
+ */
+#define PUT_INDEX(lock_ptr,indexv) (lock_ptr)->index = indexv
+#define GET_INDEX(lock_ptr)        (lock_ptr)->index
+
+#define PUT_RWINDEX(rwlock_ptr,indexv) (rwlock_ptr)->index = indexv
+#define GET_RWINDEX(rwlock_ptr)        (rwlock_ptr)->index
+#define PUT_RW_CPU(rwlock_ptr,cpuv)    (rwlock_ptr)->cpu = cpuv
+#define GET_RW_CPU(rwlock_ptr)         (rwlock_ptr)->cpu
+
+#ifndef USER_MODE_TESTING
+#include <asm/lockmeter.h>
+#else
+#include "asm_newlockmeter.h"
+#endif
+
+/*
+ * Size and mask for the hash table into the directory.
+ */
+#define LSTAT_HASH_TABLE_SIZE		4096		/* must be 2**N */
+#define LSTAT_HASH_TABLE_MASK		(LSTAT_HASH_TABLE_SIZE-1)
+
+#define DIRHASH(ra)      ((unsigned long)(ra)>>2 & LSTAT_HASH_TABLE_MASK)
+
+/*
+ * This version eliminates the per processor lock stack.  What we do is to
+ * store the index of the lock hash structure in unused bits in the lock
+ * itself.  Then on unlock we can find the statistics record without doing
+ * any additional hash or lock stack lookup.  This works for spin_locks.
+ * Hold time reporting is now basically as cheap as wait time reporting
+ * so we ignore the difference between LSTAT_ON_HOLD and LSTAT_ON_WAIT
+ * as in version 1.1.* of lockmeter.
+ *
+ * For rw_locks, we store the index of a global reader stats structure in
+ * the lock and the writer index is stored in the latter structure.
+ * For read mode locks we hash at the time of the lock to find an entry
+ * in the directory for reader wait time and the like.
+ * At unlock time for read mode locks, we update just the global structure
+ * so we don't need to know the reader directory index value at unlock time.
+ *
+ */
+
+/*
+ * Protocol to change lstat_control.state
+ *   This is complicated because we don't want the cum_hold_time for
+ * a rw_lock to be decremented in _read_lock_ without making sure it
+ * is incremented in _read_lock_ and vice versa.  So here is the
+ * way we change the state of lstat_control.state:
+ * I.  To Turn Statistics On
+ *     After allocating storage, set lstat_control.state non-zero.
+ * This works because we don't start updating statistics for in use
+ * locks until the reader lock count goes to zero.
+ * II. To Turn Statistics Off:
+ * (0)  Disable interrupts on this CPU
+ * (1)  Seize the lstat_control.directory_lock
+ * (2)  Obtain the current value of lstat_control.next_free_read_lock_index
+ * (3)  Store a zero in lstat_control.state.
+ * (4)  Release the lstat_control.directory_lock
+ * (5)  For each lock in the read lock list up to the saved value
+ *      (well, -1) of the next_free_read_lock_index, do the following:
+ *      (a)  Check validity of the stored lock address
+ *           by making sure that the word at the saved addr
+ *           has an index that matches this entry.  If not
+ *           valid, then skip this entry.
+ *      (b)  If there is a write lock already set on this lock,
+ *           skip to (d) below.
+ *      (c)  Set a non-metered write lock on the lock
+ *      (d)  set the cached INDEX in the lock to zero
+ *      (e)  Release the non-metered write lock.
+ * (6)  Re-enable interrupts
+ *
+ * These rules ensure that a read lock will not have its statistics
+ * partially updated even though the global lock recording state has
+ * changed.  See put_lockmeter_info() for implementation.
+ *
+ * The reason for (b) is that there may be write locks set on the
+ * syscall path to put_lockmeter_info() from user space.  If we do
+ * not do this check, then we can deadlock.  A similar problem would
+ * occur if the lock was read locked by the current CPU.  At the
+ * moment this does not appear to happen.
+ */
+
+/*
+ * Main control structure for lockstat. Used to turn statistics on/off
+ * and to maintain directory info.
+ */
+typedef struct {
+	int				state;
+	spinlock_t		control_lock;		/* used to serialize turning statistics on/off   */
+	spinlock_t		directory_lock;		/* for serialize adding entries to directory     */
+	volatile int	next_free_dir_index;/* next free entry in the directory */
+	/* FIXME not all of these fields are used / needed .............. */
+                /* the following fields represent data since     */
+		/* first "lstat on" or most recent "lstat reset" */
+	TIME_T      first_started_time;     /* time when measurement first enabled */
+	TIME_T      started_time;           /* time when measurement last started  */
+	TIME_T      ending_time;            /* time when measurement last disabled */
+	uint64_t    started_cycles64;       /* cycles when measurement last started          */
+	uint64_t    ending_cycles64;        /* cycles when measurement last disabled         */
+	uint64_t    enabled_cycles64;       /* total cycles with measurement enabled         */
+	int         intervals;              /* number of measurement intervals recorded      */
+	                                    /* i. e. number of times did lstat on;lstat off  */
+	lstat_directory_entry_t	*dir;		/* directory */
+	int         dir_overflow;           /* count of times ran out of space in directory  */
+	int         rwlock_overflow;        /* count of times we couldn't allocate a rw block*/
+	ushort		*hashtab;		 	    /* hash table for quick dir scans */
+	lstat_cpu_counts_t	*counts[NR_CPUS];	 /* Array of pointers to per-cpu stats */
+    int         next_free_read_lock_index;   /* next rwlock reader (global) stats block  */
+    lstat_read_lock_cpu_counts_t *read_lock_counts[NR_CPUS]; /* per cpu read lock stats  */
+} lstat_control_t;
+
+#endif	/* defined(__KERNEL__) || defined(USER_MODE_TESTING) */
+
+typedef struct {
+	short		lstat_version;		/* version of the data */
+	short		state;			/* the current state is returned */
+	int		maxcpus;		/* Number of cpus present */
+	int		next_free_dir_index;	/* index of the next free directory entry */
+	TIME_T          first_started_time;	/* when measurement enabled for first time */
+	TIME_T          started_time;		/* time in secs since 1969 when stats last turned on  */
+	TIME_T		ending_time;		/* time in secs since 1969 when stats last turned off */
+	uint32_t	cycleval;		/* cycles per second */
+#ifdef notyet
+	void		*kernel_magic_addr;	/* address of kernel_magic */
+	void		*kernel_end_addr;	/* contents of kernel magic (points to "end") */
+#endif
+	int              next_free_read_lock_index; /* index of next (global) read lock stats struct */
+	uint64_t         started_cycles64;	/* cycles when measurement last started        */
+	uint64_t         ending_cycles64;	/* cycles when stats last turned off           */
+	uint64_t         enabled_cycles64;	/* total cycles with measurement enabled       */
+	int              intervals;		/* number of measurement intervals recorded      */
+						/* i.e. number of times we did lstat on;lstat off*/
+	int              dir_overflow;		/* number of times we wanted more space in directory */
+	int              rwlock_overflow;	/* # of times we wanted more space in read_locks_count */
+	struct new_utsname   uts;		/* info about machine where stats are measured */
+						/* -T option of lockstat allows data to be     */
+						/* moved to another machine. ................. */
+} lstat_user_request_t;
+
+#else
+XXX fix:  defines for _metered routines
+
+#endif
+
+#endif /* _LINUX_LOCKMETER_H */
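
[The core trick in this header is LSTAT_RA(), which tags the caller's return address with the lock type in its two low bits (the true low RA bits are sacrificed, which is fine for statistics), and DIRHASH(), which shifts those bits back out before hashing into the 4096-entry table. With a hypothetical return address the values work out as follows:]

/* worked example of LSTAT_RA()/DIRHASH() with a hypothetical address: */
/* caller_pc                == 0x80123458                              */
/* LSTAT_RA(LSTAT_RA_WRITE) == (0x80123458 & ~3) | 3 == 0x8012345b     */
/* DIRHASH(0x8012345b)      == (0x8012345b >> 2) & 0xfff == 0xd16      */
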
Index: linux/kernel/lockmeter.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux/kernel/lockmeter.c	2005-11-18 14:59:31.000000000 -0800
@@ -0,0 +1,1251 @@
+/*
+ *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
+ *
+ *  Written by John Hawkes (hawkes@sgi.com)
+ *  Based on klstat.c by Jack Steiner (steiner@sgi.com)
+ *
+ *  Modified by Ray Bryant (raybry@us.ibm.com)
+ *  Changes Copyright (C) 2000 IBM, Inc.
+ *  Added save of index in spinlock_t to improve efficiency
+ *  of "hold" time reporting for spinlocks
+ *  Added support for hold time statistics for read and write
+ *  locks.
+ *
+ *  Modified by Ray Bryant (raybry@sgi.com)
+ *  Changes Copyright (C) 2004, Silicon Graphics, Inc.
+ *  Fix to work with out-of-line spinlocks.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <linux/preempt.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/utsname.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/lockmeter.h>
+
+#define ASSERT(cond)
+#define bzero(loc,size)		memset(loc,0,size)
+
+/*<---------------------------------------------------*/
+/*              lockmeter.c                           */
+/*>---------------------------------------------------*/
+
+static lstat_control_t lstat_control __cacheline_aligned =
+	{ LSTAT_OFF, SPIN_LOCK_UNLOCKED, SPIN_LOCK_UNLOCKED,
+	  19 * 0, NR_CPUS * 0, 0, NR_CPUS * 0 };
+
+static ushort lstat_make_dir_entry(void *, void *);
+
+/*
+ * lstat_lookup
+ *
+ * Given a RA, locate the directory entry for the lock.
+ */
+static ushort
+lstat_lookup(void *lock_ptr, void *caller_ra)
+{
+	ushort index;
+	lstat_directory_entry_t *dirp;
+
+	dirp = lstat_control.dir;
+
+	index = lstat_control.hashtab[DIRHASH(caller_ra)];
+	while (dirp[index].caller_ra != caller_ra) {
+		if (index == 0) {
+			return lstat_make_dir_entry(lock_ptr, caller_ra);
+		}
+		index = dirp[index].next_stat_index;
+	}
+
+	if (dirp[index].lock_ptr != NULL && dirp[index].lock_ptr != lock_ptr) {
+		dirp[index].lock_ptr = NULL;
+	}
+
+	return index;
+}
+
+/*
+ * lstat_make_dir_entry
+ * Called to add a new lock to the lock directory.
+ */
+static ushort
+lstat_make_dir_entry(void *lock_ptr, void *caller_ra)
+{
+	lstat_directory_entry_t *dirp;
+	ushort index, hindex;
+	unsigned long flags;
+
+	/* lock the table without recursively reentering this metering code */
+	local_irq_save(flags);
+	_raw_spin_lock(&lstat_control.directory_lock);
+
+	hindex = DIRHASH(caller_ra);
+	index = lstat_control.hashtab[hindex];
+	dirp = lstat_control.dir;
+	while (index && dirp[index].caller_ra != caller_ra)
+		index = dirp[index].next_stat_index;
+
+	if (index == 0) {
+		if (lstat_control.next_free_dir_index < LSTAT_MAX_STAT_INDEX) {
+			index = lstat_control.next_free_dir_index++;
+			lstat_control.dir[index].caller_ra = caller_ra;
+			lstat_control.dir[index].lock_ptr = lock_ptr;
+			lstat_control.dir[index].next_stat_index =
+				lstat_control.hashtab[hindex];
+			lstat_control.hashtab[hindex] = index;
+		} else {
+			lstat_control.dir_overflow++;
+		}
+	}
+	_raw_spin_unlock(&lstat_control.directory_lock);
+	local_irq_restore(flags);
+	return index;
+}
+
+int
+lstat_update(void *lock_ptr, void *caller_ra, int action)
+{
+	int index;
+	int cpu;
+
+	ASSERT(action < LSTAT_ACT_MAX_VALUES);
+
+	if (lstat_control.state == LSTAT_OFF)
+		return 0;
+
+	index = lstat_lookup(lock_ptr, caller_ra);
+	cpu = THIS_CPU_NUMBER;
+	(*lstat_control.counts[cpu])[index].count[action]++;
+	(*lstat_control.counts[cpu])[index].acquire_time = get_cycles();
+
+	return index;
+}
+
+int
+lstat_update_time(void *lock_ptr, void *caller_ra, int action, uint32_t ticks)
+{
+	ushort index;
+	int cpu;
+
+	ASSERT(action < LSTAT_ACT_MAX_VALUES);
+
+	if (lstat_control.state == LSTAT_OFF)
+		return 0;
+
+	index = lstat_lookup(lock_ptr, caller_ra);
+	cpu = THIS_CPU_NUMBER;
+	(*lstat_control.counts[cpu])[index].count[action]++;
+	(*lstat_control.counts[cpu])[index].cum_wait_ticks += (uint64_t) ticks;
+	if ((*lstat_control.counts[cpu])[index].max_wait_ticks < ticks)
+		(*lstat_control.counts[cpu])[index].max_wait_ticks = ticks;
+
+	(*lstat_control.counts[cpu])[index].acquire_time = get_cycles();
+
+	return index;
+}
+
+void
+_metered_spin_lock(spinlock_t * lock_ptr, void *caller_pc)
+{
+	if (lstat_control.state == LSTAT_OFF) {
+		_raw_spin_lock(lock_ptr);	/* do the real lock */
+		PUT_INDEX(lock_ptr, 0);	/* clean index in case lockmetering  */
+		/* gets turned on before unlock */
+	} else {
+		void *this_pc = LSTAT_RA(LSTAT_RA_SPIN);
+		int index;
+
+		if (_raw_spin_trylock(lock_ptr)) {
+			index = lstat_update(lock_ptr, this_pc,
+						LSTAT_ACT_NO_WAIT);
+		} else {
+			uint32_t start_cycles = get_cycles();
+			_raw_spin_lock(lock_ptr);	/* do the real lock */
+			index = lstat_update_time(lock_ptr, this_pc,
+				LSTAT_ACT_SPIN, get_cycles() - start_cycles);
+		}
+		/* save the index in the lock itself for use in spin unlock */
+		PUT_INDEX(lock_ptr, index);
+	}
+}
+/* some archs require this for atomic_dec_and_lock in modules */
+EXPORT_SYMBOL(_metered_spin_lock);
+
+void
+_metered_spin_lock_flags(spinlock_t * lock_ptr, unsigned long *flags,
+			 void *caller_pc)
+{
+	if (lstat_control.state == LSTAT_OFF) {
+		_raw_spin_lock_flags(lock_ptr, flags);	/* do the real lock */
+		PUT_INDEX(lock_ptr, 0);	/* clean index in case lockmetering  */
+		/* gets turned on before unlock */
+	} else {
+		void *this_pc = LSTAT_RA(LSTAT_RA_SPIN);
+		int index;
+
+		if (_raw_spin_trylock(lock_ptr)) {
+			index = lstat_update(lock_ptr, this_pc,
+						LSTAT_ACT_NO_WAIT);
+		} else {
+			uint32_t start_cycles = get_cycles();
+			/* do the real lock */
+			_raw_spin_lock_flags(lock_ptr, flags);
+			index = lstat_update_time(lock_ptr, this_pc,
+				LSTAT_ACT_SPIN, get_cycles() - start_cycles);
+		}
+		/* save the index in the lock itself for use in spin unlock */
+		PUT_INDEX(lock_ptr, index);
+	}
+}
+
+int
+_metered_spin_trylock(spinlock_t * lock_ptr, void *caller_pc)
+{
+	if (lstat_control.state == LSTAT_OFF) {
+		return _raw_spin_trylock(lock_ptr);
+	} else {
+		int retval;
+		void *this_pc = LSTAT_RA(LSTAT_RA_SPIN);
+
+		if ((retval = _raw_spin_trylock(lock_ptr))) {
+			int index = lstat_update(lock_ptr, this_pc,
+						LSTAT_ACT_NO_WAIT);
+			/*
+			 * save the index in the lock itself for use in spin
+			 * unlock
+			 */
+			PUT_INDEX(lock_ptr, index);
+		} else {
+			lstat_update(lock_ptr, this_pc, LSTAT_ACT_REJECT);
+		}
+
+		return retval;
+	}
+}
+
+void
+_metered_spin_unlock(spinlock_t * lock_ptr)
+{
+	int index = -1;
+
+	if (lstat_control.state != LSTAT_OFF) {
+		index = GET_INDEX(lock_ptr);
+		/*
+		 * If statistics were turned off when we set the lock,
+		 * then the index can be zero.  If that is the case,
+		 * then collect no stats on this call.
+		 */
+		if (index > 0) {
+			uint32_t hold_time;
+			int cpu = THIS_CPU_NUMBER;
+			hold_time = get_cycles() -
+			 (*lstat_control.counts[cpu])[index].acquire_time;
+			(*lstat_control.counts[cpu])[index].cum_hold_ticks +=
+				(uint64_t) hold_time;
+			if ((*lstat_control.counts[cpu])[index].max_hold_ticks <
+			    hold_time)
+				(*lstat_control.counts[cpu])[index].
+				    max_hold_ticks = hold_time;
+		}
+	}
+
+	/* make sure we don't have a stale index value saved */
+	PUT_INDEX(lock_ptr, 0);
+	_raw_spin_unlock(lock_ptr);	/* do the real unlock */
+}
+/* some archs require this for atomic_dec_and_lock in modules*/
+EXPORT_SYMBOL(_metered_spin_unlock);
+
+/*
+ * allocate the next global read lock structure and store its index
+ * in the rwlock at "rwlock_ptr".
+ */
+uint32_t
+alloc_rwlock_struct(rwlock_t * rwlock_ptr)
+{
+	int index;
+	unsigned long flags;
+	int cpu = THIS_CPU_NUMBER;
+
+	/* If we've already overflowed, then do a quick exit */
+	if (lstat_control.next_free_read_lock_index >
+			LSTAT_MAX_READ_LOCK_INDEX) {
+		lstat_control.rwlock_overflow++;
+		return 0;
+	}
+
+	local_irq_save(flags);
+	_raw_spin_lock(&lstat_control.directory_lock);
+
+	/* It is possible this changed while we were waiting for the directory_lock */
+	if (lstat_control.state == LSTAT_OFF) {
+		index = 0;
+		goto unlock;
+	}
+
+	/* It is possible someone else got here first and set the index */
+	if ((index = GET_RWINDEX(rwlock_ptr)) == 0) {
+		/*
+		 * we can't turn on read stats for this lock while there are
+		 * readers (this would mess up the running hold time sum at
+		 * unlock time)
+		 */
+		if (RWLOCK_READERS(rwlock_ptr) != 0) {
+			index = 0;
+			goto unlock;
+		}
+
+		/*
+		 * if stats are turned on after being off, we may need to
+		 * return an old index from when the statistics were on last
+		 * time.
+		 */
+		for (index = 1; index < lstat_control.next_free_read_lock_index;
+				index++)
+			if ((*lstat_control.read_lock_counts[cpu])[index].
+					lock_ptr == rwlock_ptr)
+				goto put_index_and_unlock;
+
+		/* allocate the next global read lock structure */
+		if (lstat_control.next_free_read_lock_index >=
+		    LSTAT_MAX_READ_LOCK_INDEX) {
+			lstat_control.rwlock_overflow++;
+			index = 0;
+			goto unlock;
+		}
+		index = lstat_control.next_free_read_lock_index++;
+
+		/*
+		 * initialize the global read stats data structure for each
+		 * cpu
+		 */
+		for_each_online_cpu(cpu) {
+			(*lstat_control.read_lock_counts[cpu])[index].lock_ptr =
+				rwlock_ptr;
+		}
+put_index_and_unlock:
+		/* store the index for the read lock structure into the lock */
+		PUT_RWINDEX(rwlock_ptr, index);
+	}
+
+unlock:
+	_raw_spin_unlock(&lstat_control.directory_lock);
+	local_irq_restore(flags);
+	return index;
+}
+
+void
+_metered_read_lock(rwlock_t * rwlock_ptr, void *caller_pc)
+{
+	void *this_pc;
+	uint32_t start_cycles;
+	int index;
+	int cpu;
+	unsigned long flags;
+	int readers_before, readers_after;
+	uint64_t cycles64;
+
+	if (lstat_control.state == LSTAT_OFF) {
+		_raw_read_lock(rwlock_ptr);
+		/* clean index in case lockmetering turns on before an unlock */
+		PUT_RWINDEX(rwlock_ptr, 0);
+		return;
+	}
+
+	this_pc = LSTAT_RA(LSTAT_RA_READ);
+	cpu = THIS_CPU_NUMBER;
+	index = GET_RWINDEX(rwlock_ptr);
+
+	/* allocate the global stats entry for this lock, if needed */
+	if (index == 0)
+		index = alloc_rwlock_struct(rwlock_ptr);
+
+	readers_before = RWLOCK_READERS(rwlock_ptr);
+	if (_raw_read_trylock(rwlock_ptr)) {
+		/*
+		 * We have decremented the lock to count a new reader,
+		 * and have confirmed that no writer has it locked.
+		 */
+		/* update statistics if enabled */
+		if (index > 0) {
+			local_irq_save(flags);
+			lstat_update((void *) rwlock_ptr, this_pc,
+					LSTAT_ACT_NO_WAIT);
+			/* preserve value of TSC so cum_hold_ticks and start_busy use same value */
+			cycles64 = get_cycles64();
+			(*lstat_control.read_lock_counts[cpu])[index].
+				cum_hold_ticks -= cycles64;
+
+			/* record time and cpu of start of busy period */
+			/* this is not perfect (some race conditions are possible) */
+			if (readers_before == 0) {
+				(*lstat_control.read_lock_counts[cpu])[index].
+					start_busy = cycles64;
+				PUT_RW_CPU(rwlock_ptr, cpu);
+			}
+			readers_after = RWLOCK_READERS(rwlock_ptr);
+			if (readers_after >
+				(*lstat_control.read_lock_counts[cpu])[index].
+					max_readers)
+				(*lstat_control.read_lock_counts[cpu])[index].
+					max_readers = readers_after;
+			local_irq_restore(flags);
+		}
+
+		return;
+	}
+	/* If we get here, then we could not quickly grab the read lock */
+
+	start_cycles = get_cycles();	/* start counting the wait time */
+
+	/* Now spin until read_lock is successful */
+	_raw_read_lock(rwlock_ptr);
+
+	lstat_update_time((void *) rwlock_ptr, this_pc, LSTAT_ACT_SPIN,
+			  get_cycles() - start_cycles);
+
+	/* update statistics if they are enabled for this lock */
+	if (index > 0) {
+		local_irq_save(flags);
+		cycles64 = get_cycles64();
+		(*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks -=
+				cycles64;
+
+		/* this is not perfect (some race conditions are possible) */
+		if (readers_before == 0) {
+			(*lstat_control.read_lock_counts[cpu])[index].
+				start_busy = cycles64;
+			PUT_RW_CPU(rwlock_ptr, cpu);
+		}
+		readers_after = RWLOCK_READERS(rwlock_ptr);
+		if (readers_after >
+		    (*lstat_control.read_lock_counts[cpu])[index].max_readers)
+			(*lstat_control.read_lock_counts[cpu])[index].
+				max_readers = readers_after;
+		local_irq_restore(flags);
+	}
+}
+
+void
+_metered_read_unlock(rwlock_t * rwlock_ptr)
+{
+	int index;
+	int cpu;
+	unsigned long flags;
+	uint64_t busy_length;
+	uint64_t cycles64;
+
+	if (lstat_control.state == LSTAT_OFF) {
+		_raw_read_unlock(rwlock_ptr);
+		return;
+	}
+
+	index = GET_RWINDEX(rwlock_ptr);
+	cpu = THIS_CPU_NUMBER;
+
+	if (index > 0) {
+		local_irq_save(flags);
+		/*
+		 * preserve value of TSC so cum_hold_ticks and busy_ticks are
+		 * consistent.
+		 */
+		cycles64 = get_cycles64();
+		(*lstat_control.read_lock_counts[cpu])[index].cum_hold_ticks +=
+			cycles64;
+		(*lstat_control.read_lock_counts[cpu])[index].read_lock_count++;
+
+		/*
+		 * once again, this is not perfect (some race conditions are
+		 * possible)
+		 */
+		if (RWLOCK_READERS(rwlock_ptr) == 1) {
+			int cpu1 = GET_RW_CPU(rwlock_ptr);
+			uint64_t last_start_busy =
+				(*lstat_control.read_lock_counts[cpu1])[index].
+					start_busy;
+			(*lstat_control.read_lock_counts[cpu])[index].
+				busy_periods++;
+			if (cycles64 > last_start_busy) {
+				busy_length = cycles64 - last_start_busy;
+				(*lstat_control.read_lock_counts[cpu])[index].
+					busy_ticks += busy_length;
+				if (busy_length >
+					(*lstat_control.
+						read_lock_counts[cpu])[index].
+							max_busy)
+					(*lstat_control.
+					 read_lock_counts[cpu])[index].
+						max_busy = busy_length;
+			}
+		}
+		local_irq_restore(flags);
+	}
+	_raw_read_unlock(rwlock_ptr);
+}
+
+void
+_metered_write_lock(rwlock_t * rwlock_ptr, void *caller_pc)
+{
+	uint32_t start_cycles;
+	void *this_pc;
+	uint32_t spin_ticks = 0; /* in anticipation of a potential wait */
+	int index;
+	int write_index = 0;
+	int cpu;
+	enum {
+		writer_writer_conflict,
+		writer_reader_conflict
+	} why_wait = writer_writer_conflict;
+
+	if (lstat_control.state == LSTAT_OFF) {
+		_raw_write_lock(rwlock_ptr);
+		/* clean index in case lockmetering turns on before an unlock */
+		PUT_RWINDEX(rwlock_ptr, 0);
+		return;
+	}
+
+	this_pc = LSTAT_RA(LSTAT_RA_WRITE);
+	cpu = THIS_CPU_NUMBER;
+	index = GET_RWINDEX(rwlock_ptr);
+
+	/* allocate the global stats entry for this lock, if needed */
+	if (index == 0) {
+		index = alloc_rwlock_struct(rwlock_ptr);
+	}
+
+	if (_raw_write_trylock(rwlock_ptr)) {
+		/* We acquired the lock on the first try */
+		write_index = lstat_update((void *) rwlock_ptr, this_pc,
+					LSTAT_ACT_NO_WAIT);
+		/* save the write_index for use in unlock if stats enabled */
+		if (index > 0)
+			(*lstat_control.read_lock_counts[cpu])[index].
+				write_index = write_index;
+		return;
+	}
+
+	/* If we get here, then we could not quickly grab the write lock */
+	start_cycles = get_cycles();	/* start counting the wait time */
+
+	why_wait = RWLOCK_READERS(rwlock_ptr) ?
+			writer_reader_conflict : writer_writer_conflict;
+
+	/* Now set the lock and wait for conflicts to disappear */
+	_raw_write_lock(rwlock_ptr);
+
+	spin_ticks = get_cycles() - start_cycles;
+
+	/* update stats -- if enabled */
+	if (index > 0 && spin_ticks) {
+		if (why_wait == writer_reader_conflict) {
+			/* waited due to a reader holding the lock */
+			write_index = lstat_update_time((void *)rwlock_ptr,
+					this_pc, LSTAT_ACT_SPIN, spin_ticks);
+		} else {
+			/*
+			 * waited due to another writer holding the lock
+			 */
+			write_index = lstat_update_time((void *)rwlock_ptr,
+				this_pc, LSTAT_ACT_WW_SPIN, spin_ticks);
+			(*lstat_control.counts[cpu])[write_index].
+				cum_wait_ww_ticks += spin_ticks;
+			if (spin_ticks >
+				(*lstat_control.counts[cpu])[write_index].
+					max_wait_ww_ticks) {
+				(*lstat_control.counts[cpu])[write_index].
+					max_wait_ww_ticks = spin_ticks;
+			}
+		}
+
+		/* save the directory index for use on write_unlock */
+		(*lstat_control.read_lock_counts[cpu])[index].
+			write_index = write_index;
+	}
+}
+
+void
+_metered_write_unlock(rwlock_t * rwlock_ptr)
+{
+	int index;
+	int cpu;
+	int write_index;
+	uint32_t hold_time;
+
+	if (lstat_control.state == LSTAT_OFF) {
+		_raw_write_unlock(rwlock_ptr);
+		return;
+	}
+
+	cpu = THIS_CPU_NUMBER;
+	index = GET_RWINDEX(rwlock_ptr);
+
+	/* update statistics if stats enabled for this lock */
+	if (index > 0) {
+		write_index =
+		    (*lstat_control.read_lock_counts[cpu])[index].write_index;
+
+		hold_time = get_cycles() -
+			(*lstat_control.counts[cpu])[write_index].acquire_time;
+		(*lstat_control.counts[cpu])[write_index].cum_hold_ticks +=
+			(uint64_t) hold_time;
+		if ((*lstat_control.counts[cpu])[write_index].max_hold_ticks <
+				hold_time)
+			(*lstat_control.counts[cpu])[write_index].
+				max_hold_ticks = hold_time;
+	}
+	_raw_write_unlock(rwlock_ptr);
+}
+
+int
+_metered_write_trylock(rwlock_t * rwlock_ptr, void *caller_pc)
+{
+	int retval;
+	void *this_pc = LSTAT_RA(LSTAT_RA_WRITE);
+
+	if ((retval = _raw_write_trylock(rwlock_ptr))) {
+		lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_NO_WAIT);
+	} else {
+		lstat_update(rwlock_ptr, this_pc, LSTAT_ACT_REJECT);
+	}
+
+	return retval;
+}
+
+static void
+init_control_space(void)
+{
+	/* Set all control space pointers to null and indices to "empty" */
+	int cpu;
+
+	/*
+	 * Access CPU_CYCLE_FREQUENCY at the outset, which in some
+	 * architectures may trigger a runtime calculation that uses a
+	 * spinlock.  Let's do this before lockmetering is turned on.
+	 */
+	if (CPU_CYCLE_FREQUENCY == 0)
+		BUG();
+
+	lstat_control.hashtab = NULL;
+	lstat_control.dir = NULL;
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		lstat_control.counts[cpu] = NULL;
+		lstat_control.read_lock_counts[cpu] = NULL;
+	}
+}
+
+static int
+reset_lstat_data(void)
+{
+	int cpu, flags;
+
+	flags = 0;
+	lstat_control.next_free_dir_index = 1;	/* 0 is for overflows */
+	lstat_control.next_free_read_lock_index = 1;
+	lstat_control.dir_overflow = 0;
+	lstat_control.rwlock_overflow = 0;
+
+	lstat_control.started_cycles64 = 0;
+	lstat_control.ending_cycles64 = 0;
+	lstat_control.enabled_cycles64 = 0;
+	lstat_control.first_started_time = 0;
+	lstat_control.started_time = 0;
+	lstat_control.ending_time = 0;
+	lstat_control.intervals = 0;
+
+	/*
+	 * paranoia -- in case someone does a "lockstat reset" before
+	 * "lockstat on"
+	 */
+	if (lstat_control.hashtab) {
+		bzero(lstat_control.hashtab,
+			LSTAT_HASH_TABLE_SIZE * sizeof (short));
+		bzero(lstat_control.dir, LSTAT_MAX_STAT_INDEX *
+				sizeof (lstat_directory_entry_t));
+
+		for_each_online_cpu(cpu) {
+			bzero(lstat_control.counts[cpu],
+				sizeof (lstat_cpu_counts_t));
+			bzero(lstat_control.read_lock_counts[cpu],
+				sizeof (lstat_read_lock_cpu_counts_t));
+		}
+	}
+#ifdef NOTDEF
+	_raw_spin_unlock(&lstat_control.directory_lock);
+	local_irq_restore(flags);
+#endif
+	return 1;
+}
+
+static void
+release_control_space(void)
+{
+	/*
+	 * Called either (1) when allocation of kmem fails,
+	 * or (2) when user writes LSTAT_RELEASE to /proc/lockmeter.
+	 * Assume that all pointers have been initialized to zero,
+	 * i.e., nonzero pointers are valid addresses.
+	 */
+	int cpu;
+
+	if (lstat_control.hashtab) {
+		kfree(lstat_control.hashtab);
+		lstat_control.hashtab = NULL;
+	}
+
+	if (lstat_control.dir) {
+		vfree(lstat_control.dir);
+		lstat_control.dir = NULL;
+	}
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (lstat_control.counts[cpu]) {
+			vfree(lstat_control.counts[cpu]);
+			lstat_control.counts[cpu] = NULL;
+		}
+		if (lstat_control.read_lock_counts[cpu]) {
+			kfree(lstat_control.read_lock_counts[cpu]);
+			lstat_control.read_lock_counts[cpu] = NULL;
+		}
+	}
+}
+
+int
+get_lockmeter_info_size(void)
+{
+	return sizeof (lstat_user_request_t)
+		+ num_online_cpus() * sizeof (lstat_cpu_counts_t)
+		+ num_online_cpus() * sizeof (lstat_read_lock_cpu_counts_t)
+		+ (LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t));
+}
+
+ssize_t
+get_lockmeter_info(char *buffer, size_t max_len, loff_t * last_index)
+{
+	lstat_user_request_t req;
+	struct timeval tv;
+	ssize_t next_ret_bcount;
+	ssize_t actual_ret_bcount = 0;
+	int cpu;
+
+	*last_index = 0;	/* a one-shot read */
+
+	req.lstat_version = LSTAT_VERSION;
+	req.state = lstat_control.state;
+	req.maxcpus = num_online_cpus();
+	req.cycleval = CPU_CYCLE_FREQUENCY;
+#ifdef notyet
+	req.kernel_magic_addr = (void *) &_etext;
+	req.kernel_end_addr = (void *) &_etext;
+#endif
+	req.uts = system_utsname;
+	req.intervals = lstat_control.intervals;
+
+	req.first_started_time = lstat_control.first_started_time;
+	req.started_time = lstat_control.started_time;
+	req.started_cycles64 = lstat_control.started_cycles64;
+
+	req.next_free_dir_index = lstat_control.next_free_dir_index;
+	req.next_free_read_lock_index = lstat_control.next_free_read_lock_index;
+	req.dir_overflow = lstat_control.dir_overflow;
+	req.rwlock_overflow = lstat_control.rwlock_overflow;
+
+	if (lstat_control.state == LSTAT_OFF) {
+		if (req.intervals == 0) {
+			/* measurement is off and no valid data present */
1584+			next_ret_bcount = sizeof (lstat_user_request_t);
1585+			req.enabled_cycles64 = 0;
1586+
1587+			if ((actual_ret_bcount + next_ret_bcount) > max_len)
1588+				return actual_ret_bcount;
1589+
1590+			copy_to_user(buffer, (void *) &req, next_ret_bcount);
1591+			actual_ret_bcount += next_ret_bcount;
1592+			return actual_ret_bcount;
1593+		} else {
1594+			/*
1595+			 * measurement is off but valid data present
1596+			 * fetch time info from lstat_control
1597+			 */
1598+			req.ending_time = lstat_control.ending_time;
1599+			req.ending_cycles64 = lstat_control.ending_cycles64;
1600+			req.enabled_cycles64 = lstat_control.enabled_cycles64;
1601+		}
1602+	} else {
1603+		/*
1604+		 * this must be a read while data collection is active;
1605+		 * use the current time, etc.
1606+		 */
1607+		do_gettimeofday(&tv);
1608+		req.ending_time = tv.tv_sec;
1609+		req.ending_cycles64 = get_cycles64();
1610+		req.enabled_cycles64 = req.ending_cycles64 -
1611+			req.started_cycles64 + lstat_control.enabled_cycles64;
1612+	}
1613+
1614+	next_ret_bcount = sizeof (lstat_user_request_t);
1615+	if ((actual_ret_bcount + next_ret_bcount) > max_len)
1616+		return actual_ret_bcount;
1617+
1618+	copy_to_user(buffer, (void *) &req, next_ret_bcount);
1619+	actual_ret_bcount += next_ret_bcount;
1620+
1621+	if (!lstat_control.counts[0])	/* not initialized? */
1622+		return actual_ret_bcount;
1623+
1624+	next_ret_bcount = sizeof (lstat_cpu_counts_t);
1625+	for_each_online_cpu(cpu) {
1626+		if ((actual_ret_bcount + next_ret_bcount) > max_len)
1627+			return actual_ret_bcount;	/* leave early */
1628+		copy_to_user(buffer + actual_ret_bcount,
1629+				lstat_control.counts[cpu], next_ret_bcount);
1630+		actual_ret_bcount += next_ret_bcount;
1631+	}
1632+
1633+	next_ret_bcount = LSTAT_MAX_STAT_INDEX *
1634+			sizeof (lstat_directory_entry_t);
1635+	if (((actual_ret_bcount + next_ret_bcount) > max_len)
1636+			|| !lstat_control.dir)
1637+		return actual_ret_bcount;	/* leave early */
1638+
1639+	copy_to_user(buffer + actual_ret_bcount, lstat_control.dir,
1640+			next_ret_bcount);
1641+	actual_ret_bcount += next_ret_bcount;
1642+
1643+	next_ret_bcount = sizeof (lstat_read_lock_cpu_counts_t);
1644+	for_each_online_cpu(cpu) {
1645+		if (actual_ret_bcount + next_ret_bcount > max_len)
1646+			return actual_ret_bcount;
1647+		copy_to_user(buffer + actual_ret_bcount,
1648+				lstat_control.read_lock_counts[cpu],
1649+				next_ret_bcount);
1650+		actual_ret_bcount += next_ret_bcount;
1651+	}
1652+
1653+	return actual_ret_bcount;
1654+}
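The read path above emits a one-shot, fixed-layout snapshot: an lstat_user_request_t header, then one lstat_cpu_counts_t per online CPU, then the directory, then one lstat_read_lock_cpu_counts_t per online CPU. As a rough illustration (not part of the patch), a userspace consumer could size its buffer from the file's stat size, which proc_misc_init() sets from get_lockmeter_info_size():

/*
 * Hypothetical userspace sketch: read one snapshot of /proc/lockmeter.
 * Only standard libc calls are used; actually parsing the buffer would
 * need the lstat_* structure definitions from the lockmeter headers.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;
	char *buf;
	ssize_t n;
	int fd = open("/proc/lockmeter", O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;
	buf = malloc(st.st_size);	/* entry->size from proc_misc_init() */
	if (!buf)
		return 1;
	n = read(fd, buf, st.st_size);	/* *last_index makes this a one-shot read */
	if (n > 0)
		printf("snapshot: %zd of %ld bytes\n", n, (long) st.st_size);
	free(buf);
	close(fd);
	return 0;
}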
1655+
1656+/*
1657+ *  Writing to the /proc lockmeter node enables or disables metering,
1658+ *  based upon the first byte of the written data.
1659+ *  The following values are defined:
1660+ *  LSTAT_ON: 1st call: allocates storage, initializes, and turns on measurement;
1661+ *            subsequent calls just turn on measurement
1662+ *  LSTAT_OFF: turns off measurement
1663+ *  LSTAT_RESET: resets statistics
1664+ *  LSTAT_RELEASE: releases statistics storage
1665+ *
1666+ *  This allows one to accumulate statistics over several lockstat runs:
1667+ *
1668+ *  lockstat on
1669+ *  lockstat off
1670+ *  ...repeat above as desired...
1671+ *  lockstat get
1672+ *  ...now start a new set of measurements...
1673+ *  lockstat reset
1674+ *  lockstat on
1675+ *  ...
1676+ *
1677+ */
1678+ssize_t
1679+put_lockmeter_info(const char *buffer, size_t len)
1680+{
1681+	int error = 0;
1682+	int dirsize, countsize, read_lock_countsize, hashsize;
1683+	int cpu;
1684+	char put_char;
1685+	int i, read_lock_blocks;
1686+	unsigned long flags;
1687+	rwlock_t *lock_ptr;
1688+	struct timeval tv;
1689+
1690+	if (len <= 0)
1691+		return -EINVAL;
1692+
1693+	_raw_spin_lock(&lstat_control.control_lock);
1694+
1695+	get_user(put_char, buffer);
1696+	switch (put_char) {
1697+
1698+	case LSTAT_OFF:
1699+		if (lstat_control.state != LSTAT_OFF) {
1700+			/*
1701+			 * To avoid seeing read lock hold times in an
1702+			 * inconsistent state, we have to follow this protocol
1703+			 * to turn off statistics
1704+			 */
1705+			local_irq_save(flags);
1706+			/*
1707+			 * getting this lock will stop any read lock block
1708+			 * allocations
1709+			 */
1710+			_raw_spin_lock(&lstat_control.directory_lock);
1711+			/*
1712+			 * keep any more read lock blocks from being
1713+			 * allocated
1714+			 */
1715+			lstat_control.state = LSTAT_OFF;
1716+			/* record how many read lock blocks there are */
1717+			read_lock_blocks =
1718+				lstat_control.next_free_read_lock_index;
1719+			_raw_spin_unlock(&lstat_control.directory_lock);
1720+			/* now go through the list of read locks */
1721+			cpu = THIS_CPU_NUMBER;
1722+			for (i = 1; i < read_lock_blocks; i++) {
1723+				lock_ptr =
1724+				    (*lstat_control.read_lock_counts[cpu])[i].
1725+				    lock_ptr;
1726+				/* is this saved lock address still valid? */
1727+				if (GET_RWINDEX(lock_ptr) == i) {
1728+					/*
1729+					 * lock address appears to still be
1730+					 * valid because we only hold one lock
1731+					 * at a time, this can't cause a
1732+					 * deadlock unless this is a lock held
1733+					 * as part of the current system call
1734+					 * path. At the moment there
1735+					 * are no READ mode locks held to get
1736+					 * here from user space, so we solve
1737+					 * this by skipping locks held in
1738+					 * write mode.
1739+					 */
1740+					if (RWLOCK_IS_WRITE_LOCKED(lock_ptr)) {
1741+						PUT_RWINDEX(lock_ptr, 0);
1742+						continue;
1743+					}
1744+					/*
1745+					 * now we know there are no read
1746+					 * holders of this lock! stop
1747+					 * statistics collection for this
1748+					 * lock
1749+					 */
1750+					_raw_write_lock(lock_ptr);
1751+					PUT_RWINDEX(lock_ptr, 0);
1752+					_raw_write_unlock(lock_ptr);
1753+				}
1754+				/*
1755+				 * it may still be possible for the hold time
1756+				 * sum to be negative, e.g. if a lock is
1757+				 * reallocated while "busy"; we will have to fix
1758+				 * this up in the data reduction program.
1759+				 */
1760+			}
1761+			local_irq_restore(flags);
1762+			lstat_control.intervals++;
1763+			lstat_control.ending_cycles64 = get_cycles64();
1764+			lstat_control.enabled_cycles64 +=
1765+				lstat_control.ending_cycles64 -
1766+				lstat_control.started_cycles64;
1767+			do_gettimeofday(&tv);
1768+			lstat_control.ending_time = tv.tv_sec;
1769+			/*
1770+			 * don't deallocate the structures -- we may do a
1771+			 * lockstat on to add to the data that is already
1772+			 * there. Use LSTAT_RELEASE to release storage
1773+			 */
1774+		} else {
1775+			error = -EBUSY;	/* already OFF */
1776+		}
1777+		break;
1778+
1779+	case LSTAT_ON:
1780+		if (lstat_control.state == LSTAT_OFF) {
1781+#ifdef DEBUG_LOCKMETER
1782+			printk("put_lockmeter_info(cpu=%d): LSTAT_ON\n",
1783+				THIS_CPU_NUMBER);
1784+#endif
1785+			lstat_control.next_free_dir_index = 1;	/* 0 is for overflows */
1786+
1787+			dirsize = LSTAT_MAX_STAT_INDEX *
1788+					sizeof (lstat_directory_entry_t);
1789+			hashsize =
1790+				(1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort);
1791+			countsize = sizeof (lstat_cpu_counts_t);
1792+			read_lock_countsize =
1793+				sizeof (lstat_read_lock_cpu_counts_t);
1794+#ifdef DEBUG_LOCKMETER
1795+			printk(" dirsize:%d", dirsize);
1796+			printk(" hashsize:%d", hashsize);
1797+			printk(" countsize:%d", countsize);
1798+			printk(" read_lock_countsize:%d\n",
1799+				read_lock_countsize);
1800+#endif
1801+#ifdef DEBUG_LOCKMETER
1802+			{
1803+				int secs;
1804+				unsigned long cycles;
1805+				uint64_t cycles64;
1806+
1807+				do_gettimeofday(&tv);
1808+				secs = tv.tv_sec;
1809+				do {
1810+					do_gettimeofday(&tv);
1811+				} while (secs == tv.tv_sec);
1812+				cycles = get_cycles();
1813+				cycles64 = get_cycles64();
1814+				secs = tv.tv_sec;
1815+				do {
1816+					do_gettimeofday(&tv);
1817+				} while (secs == tv.tv_sec);
1818+				cycles = get_cycles() - cycles;
1819+				cycles64 = get_cycles64() - cycles64;
1820+				printk("lockmeter: cycleFrequency:%lu cycles:%lu"
1821+					" cycles64:%llu\n", (unsigned long) CPU_CYCLE_FREQUENCY,
1822+					cycles, (unsigned long long) cycles64);
1823+			}
1824+#endif
1825+
1826+			/*
1827+			 * if this is the first call, allocate storage and
1828+			 * initialize
1829+			 */
1830+			if (!lstat_control.hashtab) {
1831+
1832+				spin_lock_init(&lstat_control.directory_lock);
1833+
1834+				/* guarantee all pointers at zero */
1835+				init_control_space();
1836+
1837+				lstat_control.hashtab =
1838+				    kmalloc(hashsize, GFP_KERNEL);
1839+				if (!lstat_control.hashtab) {
1840+					error = -ENOSPC;
1841+#ifdef DEBUG_LOCKMETER
1842+					printk("!!error kmalloc of hashtab\n");
1843+#endif
1844+				}
1845+				lstat_control.dir = vmalloc(dirsize);
1846+				if (!lstat_control.dir) {
1847+					error = -ENOSPC;
1848+#ifdef DEBUG_LOCKMETER
1849+					printk("!!error vmalloc of dir\n");
1850+#endif
1851+				}
1852+
1853+				for_each_online_cpu(cpu) {
1854+					lstat_control.counts[cpu] =
1855+						vmalloc(countsize);
1856+					if (!lstat_control.counts[cpu]) {
1857+						error = -ENOSPC;
1858+#ifdef DEBUG_LOCKMETER
1859+						printk("!!error vmalloc of "
1860+							"counts[%d]\n", cpu);
1861+#endif
1862+					}
1863+					lstat_control.read_lock_counts[cpu] =
1864+						(lstat_read_lock_cpu_counts_t *)
1865+						kmalloc(read_lock_countsize,
1866+							GFP_KERNEL);
1867+					if (!lstat_control.
1868+							read_lock_counts[cpu]) {
1869+						error = -ENOSPC;
1870+#ifdef DEBUG_LOCKMETER
1871+						printk("!!error kmalloc of "
1872+						  "read_lock_counts[%d]\n",
1873+							cpu);
1874+#endif
1875+					}
1876+				}
1877+			}
1878+
1879+			if (error) {
1880+				/*
1881+				 * One or more kmalloc failures -- free
1882+				 * everything
1883+				 */
1884+				release_control_space();
1885+			} else {
1886+
1887+				if (!reset_lstat_data()) {
1888+					error = -EINVAL;
1889+					break;
1890+				}
1891+
1892+				/*
1893+				 * record starting and ending times and the
1894+				 * like
1895+				 */
1896+				if (lstat_control.intervals == 0) {
1897+					do_gettimeofday(&tv);
1898+					lstat_control.first_started_time =
1899+						tv.tv_sec;
1900+				}
1901+				lstat_control.started_cycles64 = get_cycles64();
1902+				do_gettimeofday(&tv);
1903+				lstat_control.started_time = tv.tv_sec;
1904+
1905+				lstat_control.state = LSTAT_ON;
1906+			}
1907+		} else {
1908+			error = -EBUSY;	/* already ON */
1909+		}
1910+		break;
1911+
1912+	case LSTAT_RESET:
1913+		if (lstat_control.state == LSTAT_OFF) {
1914+			if (!reset_lstat_data())
1915+				error = -EINVAL;
1916+		} else {
1917+			error = -EBUSY;	/* still on; can't reset */
1918+		}
1919+		break;
1920+
1921+	case LSTAT_RELEASE:
1922+		if (lstat_control.state == LSTAT_OFF) {
1923+			release_control_space();
1924+			lstat_control.intervals = 0;
1925+			lstat_control.enabled_cycles64 = 0;
1926+		} else {
1927+			error = -EBUSY;
1928+		}
1929+		break;
1930+
1931+	default:
1932+		error = -EINVAL;
1933+	}			/* switch */
1934+
1935+	_raw_spin_unlock(&lstat_control.control_lock);
1936+	return error ? error : len;
1937+}
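The control bytes in the switch above come from the first byte of the write buffer, so driving the interface by hand is a one-byte write. A minimal sketch, assuming LSTAT_ON and friends are taken from the lockmeter userspace header (their values are not reproduced here):

/*
 * Hypothetical sketch: send one control byte to /proc/lockmeter.
 * LSTAT_ON/LSTAT_OFF/LSTAT_RESET/LSTAT_RELEASE must come from the
 * lockmeter header; this helper is illustrative, not part of the patch.
 */
#include <fcntl.h>
#include <unistd.h>

static int lockmeter_ctl(char cmd)
{
	int fd = open("/proc/lockmeter", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, &cmd, 1);		/* put_lockmeter_info() sees byte 0 */
	close(fd);
	return n == 1 ? 0 : -1;		/* -EBUSY etc. surface as write errors */
}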
1938+
1939+#ifdef USER_MODE_TESTING
1940+/* following used for user mode testing */
1941+void
1942+lockmeter_init()
1943+{
1944+	int dirsize, hashsize, countsize, read_lock_countsize, cpu;
1945+
1946+	printf("lstat_control is at %p size=%zu\n", (void *) &lstat_control,
1947+		sizeof (lstat_control));
1948+	printf("sizeof(spinlock_t)=%zu\n", sizeof (spinlock_t));
1949+	lstat_control.state = LSTAT_ON;
1950+
1951+	lstat_control.directory_lock = SPIN_LOCK_UNLOCKED;
1952+	lstat_control.next_free_dir_index = 1;	/* 0 is for overflows */
1953+	lstat_control.next_free_read_lock_index = 1;
1954+
1955+	dirsize = LSTAT_MAX_STAT_INDEX * sizeof (lstat_directory_entry_t);
1956+	hashsize = (1 + LSTAT_HASH_TABLE_SIZE) * sizeof (ushort);
1957+	countsize = sizeof (lstat_cpu_counts_t);
1958+	read_lock_countsize = sizeof (lstat_read_lock_cpu_counts_t);
1959+
1960+	lstat_control.hashtab = (ushort *) malloc(hashsize);
1961+
1962+	if (lstat_control.hashtab == 0) {
1963+		printf("malloc failure at line %d in lockmeter.c\n",
1964+			__LINE__);
1965+		exit(0);
1966+	}
1967+
1968+	lstat_control.dir = (lstat_directory_entry_t *) malloc(dirsize);
1969+
1970+	if (lstat_control.dir == 0) {
1971+		printf("malloc failure at line %d in lockmeter.c\n",
1972+			__LINE__);
1973+		exit(0);
1974+	}
1975+
1976+	for_each_online_cpu(cpu) {
1977+		lstat_control.counts[cpu] =
1978+			(lstat_cpu_counts_t *) malloc(countsize);
1979+		lstat_control.read_lock_counts[cpu] =
1980+			(lstat_read_lock_cpu_counts_t *)
1981+			malloc(read_lock_countsize);
1982+		if (!lstat_control.counts[cpu] ||
1983+		    !lstat_control.read_lock_counts[cpu]) {
1984+			printf("malloc failure for cpu=%d at line %d in "
1985+				"lockmeter.c\n", cpu, __LINE__);
1986+			exit(0);
1987+		}
1988+	}
1989+
1990+	memset(lstat_control.hashtab, 0, hashsize);
1991+	memset(lstat_control.dir, 0, dirsize);
1992+
1993+	for_each_online_cpu(cpu) {
1994+		memset(lstat_control.counts[cpu], 0, countsize);
1995+		memset(lstat_control.read_lock_counts[cpu], 0,
1996+			read_lock_countsize);
1997+	}
1998+}
1999+#endif
2000+
2001+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
2002+#error lockmeter cannot support CONFIG_PREEMPT now
2013+/*
2014+ * This could be a long-held lock.  If another CPU holds it for a long time,
2015+ * and that CPU is not asked to reschedule then *this* CPU will spin on the
2016+ * lock for a long time, even if *this* CPU is asked to reschedule.
2017+ *
2018+ * So what we do here, in the slow (contended) path is to spin on the lock by
2019+ * hand while permitting preemption.
2020+ *
2021+ * Called inside preempt_disable().
2022+ */
2023+static inline void __preempt_spin_lock(spinlock_t *lock, void *caller_pc)
2024+{
2025+	if (preempt_count() > 1) {
2026+		_metered_spin_lock(lock, caller_pc);
2027+		return;
2028+	}
2029+
2030+	do {
2031+		preempt_enable();
2032+		while (spin_is_locked(lock))
2033+			cpu_relax();
2034+		preempt_disable();
2035+	} while (!_metered_spin_trylock(lock, caller_pc));
2036+}
2037+
2038+void __lockfunc _spin_lock(spinlock_t *lock)
2039+{
2040+	preempt_disable();
2041+	if (unlikely(!_metered_spin_trylock(lock, __builtin_return_address(0))))
2042+		__preempt_spin_lock(lock, __builtin_return_address(0));
2043+}
2044+
2045+static inline void __preempt_write_lock(rwlock_t *lock, void *caller_pc)
2046+{
2047+	if (preempt_count() > 1) {
2048+		_metered_write_lock(lock, caller_pc);
2049+		return;
2050+	}
2051+
2052+	do {
2053+		preempt_enable();
2054+		while (rwlock_is_locked(lock))
2055+			cpu_relax();
2056+		preempt_disable();
2057+	} while (!_metered_write_trylock(lock, caller_pc));
2058+}
2059+
2060+void __lockfunc _write_lock(rwlock_t *lock)
2061+{
2062+	preempt_disable();
2063+	if (unlikely(!_metered_write_trylock(lock, __builtin_return_address(0))))
2064+		__preempt_write_lock(lock, __builtin_return_address(0));
2065+}
2066+#endif
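The code above, kept under an #error guard until lockmeter learns to coexist with CONFIG_PREEMPT, mirrors the stock kernel's __preempt_spin_lock() idea: try the lock once, and on contention spin with preemption enabled until a trylock succeeds, so a preempted lock holder can run. A user-level analogue of that shape, sketched with GCC atomic builtins purely for illustration:

/*
 * Illustrative user-level analogue of spin-with-preemption-enabled;
 * not kernel code and not part of this patch.
 */
static volatile int lock_word;

static void contended_lock(void)
{
	while (!__sync_bool_compare_and_swap(&lock_word, 0, 1)) {
		/* back off while the lock is held (cpu_relax() analogue) */
		while (lock_word)
			__asm__ __volatile__("" ::: "memory");
	}
}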
2067Index: linux/arch/sparc64/Kconfig.debug
2068===================================================================
2069--- linux.orig/arch/sparc64/Kconfig.debug	2005-11-18 14:58:48.000000000 -0800
2070+++ linux/arch/sparc64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
2071@@ -41,6 +41,13 @@
2072 	  This results in a large slowdown, but helps to find certain types
2073 	  of memory corruptions.
2074
2075+config LOCKMETER
2076+	bool "Kernel lock metering"
2077+	depends on SMP && !PREEMPT
2078+	help
2079+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
2080+	  but allows you to see various statistics using the lockstat command.
2081+
2082 config MCOUNT
2083 	bool
2084 	depends on STACK_DEBUG
2085Index: linux/arch/x86_64/Kconfig.debug
2086===================================================================
2087--- linux.orig/arch/x86_64/Kconfig.debug	2005-11-18 14:58:48.000000000 -0800
2088+++ linux/arch/x86_64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
2089@@ -54,4 +54,11 @@
2090 #config X86_REMOTE_DEBUG
2091 #       bool "kgdb debugging stub"
2092
2093+config LOCKMETER
2094+	bool "Kernel lock metering"
2095+	depends on SMP && !PREEMPT
2096+	help
2097+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
2098+	  but allows you to see various statistics using the lockstat command.
2099+
2100 endmenu
2101Index: linux/kernel/Makefile
2102===================================================================
2103--- linux.orig/kernel/Makefile	2005-11-18 14:58:48.000000000 -0800
2104+++ linux/kernel/Makefile	2005-11-18 14:59:31.000000000 -0800
2105@@ -13,6 +13,7 @@
2106 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
2107 obj-$(CONFIG_SMP) += cpu.o spinlock.o
2108 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
2109+obj-$(CONFIG_LOCKMETER) += lockmeter.o
2110 obj-$(CONFIG_UID16) += uid16.o
2111 obj-$(CONFIG_MODULES) += module.o
2112 obj-$(CONFIG_KALLSYMS) += kallsyms.o
2113Index: linux/include/linux/spinlock_types.h
2114===================================================================
2115--- linux.orig/include/linux/spinlock_types.h	2005-11-18 14:58:48.000000000 -0800
2116+++ linux/include/linux/spinlock_types.h	2005-11-18 14:59:31.000000000 -0800
2117@@ -17,6 +17,11 @@
2118
2119 typedef struct {
2120 	raw_spinlock_t raw_lock;
2121+#if defined(CONFIG_LOCKMETER)
2122+	/* required for LOCKMETER since all bits in lock    */
2123+	/* are used and we need this storage for lock INDEX */
2124+	volatile unsigned int index;
2125+#endif
2126 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
2127 	unsigned int break_lock;
2128 #endif
2129@@ -33,6 +38,10 @@
2130 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
2131 	unsigned int break_lock;
2132 #endif
2133+#ifdef CONFIG_LOCKMETER
2134+	volatile unsigned short index;
2135+	volatile unsigned short cpu;
2136+#endif
2137 #ifdef CONFIG_DEBUG_SPINLOCK
2138 	unsigned int magic, owner_cpu;
2139 	void *owner;
2140@@ -55,13 +64,64 @@
2141 				.owner = SPINLOCK_OWNER_INIT,		\
2142 				.owner_cpu = -1 }
2143 #else
2144+#ifdef CONFIG_LOCKMETER
2145+# define SPIN_LOCK_UNLOCKED \
2146+	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
2147+				.index	  = 0 }
2148+#define RW_LOCK_UNLOCKED \
2149+	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
2150+				.cpu	  = 0,				\
2151+				.index	  = 0 }
2152+#else
2153 # define SPIN_LOCK_UNLOCKED \
2154 	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED }
2155 #define RW_LOCK_UNLOCKED \
2156 	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED }
2157 #endif
2158+#endif
2159
2160 #define DEFINE_SPINLOCK(x)	spinlock_t x = SPIN_LOCK_UNLOCKED
2161 #define DEFINE_RWLOCK(x)	rwlock_t x = RW_LOCK_UNLOCKED
2162
2163+#ifdef CONFIG_LOCKMETER
2164+extern void _metered_spin_lock(spinlock_t *, void *);
2165+extern int  _metered_spin_trylock(spinlock_t *, void *);
2166+extern void _metered_spin_lock_flags(spinlock_t *, unsigned long *, void *);
2167+extern int  _metered_write_trylock(rwlock_t *, void *);
2168+extern void _metered_read_lock(rwlock_t *, void *);
2169+extern void _metered_write_lock(rwlock_t *, void *);
2170+extern void _metered_spin_unlock(spinlock_t *);
2171+extern void _metered_read_unlock(rwlock_t *);
2172+extern void _metered_write_unlock(rwlock_t *);
2173+
2174+#define _do_raw_spin_lock(lock)	\
2175+		_metered_spin_lock(lock, __builtin_return_address(0))
2176+#define _do_raw_spin_lock_flags(lock, flagsp)	\
2177+	    _metered_spin_lock_flags(lock, flagsp, __builtin_return_address(0))
2178+#define _do_raw_spin_trylock(lock)	\
2179+		_metered_spin_trylock(lock, __builtin_return_address(0))
2180+#define _do_raw_read_trylock(lock)	_raw_read_trylock(lock)
2181+#define _do_raw_write_trylock(lock)	\
2182+		_metered_write_trylock(lock, __builtin_return_address(0))
2183+#define _do_raw_read_lock(lock)	\
2184+		_metered_read_lock(lock, __builtin_return_address(0))
2185+#define _do_raw_write_lock(lock)	\
2186+		_metered_write_lock(lock, __builtin_return_address(0))
2187+#define _do_raw_spin_unlock(lock)	_metered_spin_unlock(lock)
2188+#define _do_raw_read_unlock(lock)	_metered_read_unlock(lock)
2189+#define _do_raw_write_unlock(lock)	_metered_write_unlock(lock)
2190+#else
2191+#define _do_raw_spin_lock(lock)		_raw_spin_lock(lock)
2192+#define _do_raw_spin_lock_flags(lock,flagsp)	\
2193+					_raw_spin_lock_flags(lock, flagsp)
2194+#define _do_raw_spin_trylock(lock)	_raw_spin_trylock(lock)
2195+#define _do_raw_read_trylock(lock)	_raw_read_trylock(lock)
2196+#define _do_raw_write_trylock(lock)	_raw_write_trylock(lock)
2197+#define _do_raw_read_lock(lock)		_raw_read_lock(lock)
2198+#define _do_raw_write_lock(lock)	_raw_write_lock(lock)
2199+#define _do_raw_spin_unlock(lock)	_raw_spin_unlock(lock)
2200+#define _do_raw_read_unlock(lock)	_raw_read_unlock(lock)
2201+#define _do_raw_write_unlock(lock)	_raw_write_unlock(lock)
2202+#endif
2203+
2204 #endif /* __LINUX_SPINLOCK_TYPES_H */
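With this indirection in place, the generic entry points in kernel/spinlock.c (patched below) pick up metering without any change at call sites: each _do_raw_* macro forwards to the _metered_* variant and passes __builtin_return_address(0) as the caller PC that lockmeter hashes into its directory. Roughly, under CONFIG_LOCKMETER:

/* Illustrative expansion only -- not literal compiler output. */
void __lockfunc _spin_lock(spinlock_t *lock)
{
	preempt_disable();
	_do_raw_spin_lock(lock);
	/* ...which the macro turns into:
	 *   _metered_spin_lock(lock, __builtin_return_address(0));
	 */
}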
2205Index: linux/kernel/spinlock.c
2206===================================================================
2207--- linux.orig/kernel/spinlock.c	2005-11-18 14:58:48.000000000 -0800
2208+++ linux/kernel/spinlock.c	2005-11-18 14:59:31.000000000 -0800
2209@@ -30,7 +30,7 @@
2210 int __lockfunc _spin_trylock(spinlock_t *lock)
2211 {
2212 	preempt_disable();
2213-	if (_raw_spin_trylock(lock))
2214+	if (_do_raw_spin_trylock(lock))
2215 		return 1;
2216
2217 	preempt_enable();
2218@@ -41,7 +41,7 @@
2219 int __lockfunc _read_trylock(rwlock_t *lock)
2220 {
2221 	preempt_disable();
2222-	if (_raw_read_trylock(lock))
2223+	if (_do_raw_read_trylock(lock))
2224 		return 1;
2225
2226 	preempt_enable();
2227@@ -52,7 +52,7 @@
2228 int __lockfunc _write_trylock(rwlock_t *lock)
2229 {
2230 	preempt_disable();
2231-	if (_raw_write_trylock(lock))
2232+	if (_do_raw_write_trylock(lock))
2233 		return 1;
2234
2235 	preempt_enable();
2236@@ -65,7 +65,7 @@
2237 void __lockfunc _read_lock(rwlock_t *lock)
2238 {
2239 	preempt_disable();
2240-	_raw_read_lock(lock);
2241+	_do_raw_read_lock(lock);
2242 }
2243 EXPORT_SYMBOL(_read_lock);
2244
2245@@ -75,7 +75,7 @@
2246
2247 	local_irq_save(flags);
2248 	preempt_disable();
2249-	_raw_spin_lock_flags(lock, &flags);
2250+	_do_raw_spin_lock_flags(lock, &flags);
2251 	return flags;
2252 }
2253 EXPORT_SYMBOL(_spin_lock_irqsave);
2254@@ -84,7 +84,7 @@
2255 {
2256 	local_irq_disable();
2257 	preempt_disable();
2258-	_raw_spin_lock(lock);
2259+	_do_raw_spin_lock(lock);
2260 }
2261 EXPORT_SYMBOL(_spin_lock_irq);
2262
2263@@ -92,7 +92,7 @@
2264 {
2265 	local_bh_disable();
2266 	preempt_disable();
2267-	_raw_spin_lock(lock);
2268+	_do_raw_spin_lock(lock);
2269 }
2270 EXPORT_SYMBOL(_spin_lock_bh);
2271
2272@@ -102,7 +102,7 @@
2273
2274 	local_irq_save(flags);
2275 	preempt_disable();
2276-	_raw_read_lock(lock);
2277+	_do_raw_read_lock(lock);
2278 	return flags;
2279 }
2280 EXPORT_SYMBOL(_read_lock_irqsave);
2281@@ -111,7 +111,7 @@
2282 {
2283 	local_irq_disable();
2284 	preempt_disable();
2285-	_raw_read_lock(lock);
2286+	_do_raw_read_lock(lock);
2287 }
2288 EXPORT_SYMBOL(_read_lock_irq);
2289
2290@@ -119,7 +119,7 @@
2291 {
2292 	local_bh_disable();
2293 	preempt_disable();
2294-	_raw_read_lock(lock);
2295+	_do_raw_read_lock(lock);
2296 }
2297 EXPORT_SYMBOL(_read_lock_bh);
2298
2299@@ -129,7 +129,7 @@
2300
2301 	local_irq_save(flags);
2302 	preempt_disable();
2303-	_raw_write_lock(lock);
2304+	_do_raw_write_lock(lock);
2305 	return flags;
2306 }
2307 EXPORT_SYMBOL(_write_lock_irqsave);
2308@@ -138,7 +138,7 @@
2309 {
2310 	local_irq_disable();
2311 	preempt_disable();
2312-	_raw_write_lock(lock);
2313+	_do_raw_write_lock(lock);
2314 }
2315 EXPORT_SYMBOL(_write_lock_irq);
2316
2317@@ -146,14 +146,14 @@
2318 {
2319 	local_bh_disable();
2320 	preempt_disable();
2321-	_raw_write_lock(lock);
2322+	_do_raw_write_lock(lock);
2323 }
2324 EXPORT_SYMBOL(_write_lock_bh);
2325
2326 void __lockfunc _spin_lock(spinlock_t *lock)
2327 {
2328 	preempt_disable();
2329-	_raw_spin_lock(lock);
2330+	_do_raw_spin_lock(lock);
2331 }
2332
2333 EXPORT_SYMBOL(_spin_lock);
2334@@ -161,7 +161,7 @@
2335 void __lockfunc _write_lock(rwlock_t *lock)
2336 {
2337 	preempt_disable();
2338-	_raw_write_lock(lock);
2339+	_do_raw_write_lock(lock);
2340 }
2341
2342 EXPORT_SYMBOL(_write_lock);
2343@@ -259,28 +259,28 @@
2344
2345 void __lockfunc _spin_unlock(spinlock_t *lock)
2346 {
2347-	_raw_spin_unlock(lock);
2348+	_do_raw_spin_unlock(lock);
2349 	preempt_enable();
2350 }
2351 EXPORT_SYMBOL(_spin_unlock);
2352
2353 void __lockfunc _write_unlock(rwlock_t *lock)
2354 {
2355-	_raw_write_unlock(lock);
2356+	_do_raw_write_unlock(lock);
2357 	preempt_enable();
2358 }
2359 EXPORT_SYMBOL(_write_unlock);
2360
2361 void __lockfunc _read_unlock(rwlock_t *lock)
2362 {
2363-	_raw_read_unlock(lock);
2364+	_do_raw_read_unlock(lock);
2365 	preempt_enable();
2366 }
2367 EXPORT_SYMBOL(_read_unlock);
2368
2369 void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
2370 {
2371-	_raw_spin_unlock(lock);
2372+	_do_raw_spin_unlock(lock);
2373 	local_irq_restore(flags);
2374 	preempt_enable();
2375 }
2376@@ -288,7 +288,7 @@
2377
2378 void __lockfunc _spin_unlock_irq(spinlock_t *lock)
2379 {
2380-	_raw_spin_unlock(lock);
2381+	_do_raw_spin_unlock(lock);
2382 	local_irq_enable();
2383 	preempt_enable();
2384 }
2385@@ -296,7 +296,7 @@
2386
2387 void __lockfunc _spin_unlock_bh(spinlock_t *lock)
2388 {
2389-	_raw_spin_unlock(lock);
2390+	_do_raw_spin_unlock(lock);
2391 	preempt_enable_no_resched();
2392 	local_bh_enable();
2393 }
2394@@ -304,7 +304,7 @@
2395
2396 void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
2397 {
2398-	_raw_read_unlock(lock);
2399+	_do_raw_read_unlock(lock);
2400 	local_irq_restore(flags);
2401 	preempt_enable();
2402 }
2403@@ -312,7 +312,7 @@
2404
2405 void __lockfunc _read_unlock_irq(rwlock_t *lock)
2406 {
2407-	_raw_read_unlock(lock);
2408+	_do_raw_read_unlock(lock);
2409 	local_irq_enable();
2410 	preempt_enable();
2411 }
2412@@ -320,7 +320,7 @@
2413
2414 void __lockfunc _read_unlock_bh(rwlock_t *lock)
2415 {
2416-	_raw_read_unlock(lock);
2417+	_do_raw_read_unlock(lock);
2418 	preempt_enable_no_resched();
2419 	local_bh_enable();
2420 }
2421@@ -328,7 +328,7 @@
2422
2423 void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
2424 {
2425-	_raw_write_unlock(lock);
2426+	_do_raw_write_unlock(lock);
2427 	local_irq_restore(flags);
2428 	preempt_enable();
2429 }
2430@@ -336,7 +336,7 @@
2431
2432 void __lockfunc _write_unlock_irq(rwlock_t *lock)
2433 {
2434-	_raw_write_unlock(lock);
2435+	_do_raw_write_unlock(lock);
2436 	local_irq_enable();
2437 	preempt_enable();
2438 }
2439@@ -344,7 +344,7 @@
2440
2441 void __lockfunc _write_unlock_bh(rwlock_t *lock)
2442 {
2443-	_raw_write_unlock(lock);
2444+	_do_raw_write_unlock(lock);
2445 	preempt_enable_no_resched();
2446 	local_bh_enable();
2447 }
2448@@ -354,7 +354,7 @@
2449 {
2450 	local_bh_disable();
2451 	preempt_disable();
2452-	if (_raw_spin_trylock(lock))
2453+	if (_do_raw_spin_trylock(lock))
2454 		return 1;
2455
2456 	preempt_enable_no_resched();
2457Index: linux/arch/ppc64/Kconfig.debug
2458===================================================================
2459--- linux.orig/arch/ppc64/Kconfig.debug	2005-11-18 14:58:48.000000000 -0800
2460+++ linux/arch/ppc64/Kconfig.debug	2005-11-18 14:59:31.000000000 -0800
2461@@ -19,6 +19,13 @@
2462 	  for kernel debugging, non-intrusive instrumentation and testing.
2463 	  If in doubt, say "N".
2464
2465+config LOCKMETER
2466+	bool "Kernel lock metering"
2467+	depends on SMP
2468+	help
2469+	  Say Y to enable kernel lock metering, which adds overhead to SMP locks,
2470+	  but allows you to see various statistics using the lockstat command.
2471+
2472 config DEBUG_STACK_USAGE
2473 	bool "Stack utilization instrumentation"
2474 	depends on DEBUG_KERNEL
2475Index: linux/include/asm-ppc64/lockmeter.h
2476===================================================================
2477--- /dev/null	1970-01-01 00:00:00.000000000 +0000
2478+++ linux/include/asm-ppc64/lockmeter.h	2005-11-18 14:59:31.000000000 -0800
2479@@ -0,0 +1,71 @@
2480+/*
2481+ *  Copyright (C) 1999,2000 Silicon Graphics, Inc.
2482+ *
2483+ *  Written by John Hawkes (hawkes@sgi.com)
2484+ *  Based on klstat.h by Jack Steiner (steiner@sgi.com)
2485+ *
2486+ *  Modified by Ray Bryant (raybry@us.ibm.com)
2487+ *  Changes Copyright (C) 2000 IBM, Inc.
2488+ *  Added save of index in spinlock_t to improve efficiency
2489+ *  of "hold" time reporting for spinlocks.
2490+ *  Added support for hold time statistics for read and write
2491+ *  locks.
2492+ *  Moved machine dependent code here from include/lockmeter.h.
2493+ *
2494+ *  Modified by Tony Garcia (garcia1@us.ibm.com)
2495+ *  Ported to Power PC 64
2496+ */
2497+
2498+#ifndef _PPC64_LOCKMETER_H
2499+#define _PPC64_LOCKMETER_H
2500+
2501+
2502+#include <asm/spinlock.h>
2503+#include <linux/version.h>
2504+#include <linux/cpufreq.h>
2505+
2506+#include <asm/processor.h>   /* definitions for SPRN_TBRL,
2507+                                SPRN_TBRU, mftb() */
2508+extern unsigned long ppc_proc_freq;
2509+
2510+#define CPU_CYCLE_FREQUENCY ppc_proc_freq
2511+
2512+#define THIS_CPU_NUMBER    smp_processor_id()
2513+
2514+/*
2515+ * return the number of readers for a rwlock_t
2516+ */
2517+#define RWLOCK_READERS(rwlock_ptr)   rwlock_readers(rwlock_ptr)
2518+
2519+/* Return number of readers */
2520+extern inline int rwlock_readers(rwlock_t *rwlock_ptr)
2521+{
2522+	signed int tmp = rwlock_ptr->lock;
2523+
2524+	if (tmp > 0)
2525+		return tmp;
2526+	else
2527+		return 0;
2528+}
2529+
2530+/*
2531+ * return true if rwlock is write locked
2532+ * (note that other lock attempts can cause the lock value to be negative)
2533+ */
2534+#define RWLOCK_IS_WRITE_LOCKED(rwlock_ptr) ((signed int)(rwlock_ptr)->lock < 0)
2535+#define RWLOCK_IS_READ_LOCKED(rwlock_ptr)  ((signed int)(rwlock_ptr)->lock > 0)
2536+
2537+/* Written by Carl L. to read the time base counter on ppc;
2538+   replaces the Intel-only rdtsc call */
2539+static inline long get_cycles64 (void)
2540+{
2541+	unsigned long tb;
2542+
2543+	/* read the upper and lower 32 bit Time base counter */
2544+	tb = mfspr(SPRN_TBRU);
2545+	tb = (tb << 32)  | mfspr(SPRN_TBRL);
2546+
2547+	return tb;
2548+}
2549+
2550+#endif /* _PPC64_LOCKMETER_H */
2551
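Given CPU_CYCLE_FREQUENCY (reported to userspace as cycleval in the request header) and cycle deltas from get_cycles64(), converting hold times to wall time is a single division. A sketch of what a data-reduction tool might do, assuming deltas small enough that the scaled product fits in 64 bits:

/*
 * Hypothetical sketch: convert a cycle delta from a lockmeter snapshot
 * into microseconds using the header's cycleval. Not part of the patch.
 */
#include <stdint.h>

static uint64_t cycles_to_usec(uint64_t cycles, uint64_t cycles_per_sec)
{
	/* scale first to keep microsecond resolution; assumes
	 * cycles stays well under 2^64 / 1e6, i.e. below ~2^44 */
	return (cycles * 1000000ULL) / cycles_per_sec;
}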