[PATCH] POSIX clocks & timers

This is version 23 or so of the POSIX timer code. Internal changelog: - Changed the signals code to match the new order of things. Also the new xtime_lock code needed to be picked up. It made some things a lot simpler. - Fixed a spin lock hand off problem in locking timers (thanks to Randy). - Fixed nanosleep to test for out of bound nanoseconds (thanks to Julie). - Fixed a couple of id deallocation bugs that left old ids laying around (hey I get this one). - This version has a new timer id manager. Andrew Morton suggested elimination of recursion (done) and I added code to allow it to release unused nodes. The prior version only released the leaf nodes. (The id manager uses radix tree type nodes.) Also added is a reuse count so ids will not repeat for at least 256 alloc/ free cycles. - The changes for the new sys_call restart now allow one restart function to handle both nanosleep and clock_nanosleep. Saves a bit of code, nice. - All the requested changes and Lindent too :). - I also broke clock_nanosleep() apart much the same way nanosleep() was with the 2.5.50-bk5 changes. TIMER STORMS The POSIX clocks and timers code prevents "timer storms" by not putting repeating timers back in the timer list until the signal is delivered for the prior expiry. Timer events missed by this delay are accounted for in the timer overrun count. The net result is MUCH lower system overhead while presenting the same info to the user as would be the case if an interrupt and timer processing were required for each increment in the overrun count.

[PATCH] POSIX clocks & timers
This is version 23 or so of the POSIX timer code. Internal changelog: - Changed the signals code to match the new order of things. Also the new xtime_lock code needed to be picked up. It made some things a lot simpler. - Fixed a spin lock hand off problem in locking timers (thanks to Randy). - Fixed nanosleep to test for out of bound nanoseconds (thanks to Julie). - Fixed a couple of id deallocation bugs that left old ids laying around (hey I get this one). - This version has a new timer id manager. Andrew Morton suggested elimination of recursion (done) and I added code to allow it to release unused nodes. The prior version only released the leaf nodes. (The id manager uses radix tree type nodes.) Also added is a reuse count so ids will not repeat for at least 256 alloc/ free cycles. - The changes for the new sys_call restart now allow one restart function to handle both nanosleep and clock_nanosleep. Saves a bit of code, nice. - All the requested changes and Lindent too :). - I also broke clock_nanosleep() apart much the same way nanosleep() was with the 2.5.50-bk5 changes. TIMER STORMS The POSIX clocks and timers code prevents "timer storms" by not putting repeating timers back in the timer list until the signal is delivered for the prior expiry. Timer events missed by this delay are accounted for in the timer overrun count. The net result is MUCH lower system overhead while presenting the same info to the user as would be the case if an interrupt and timer processing were required for each increment in the overrun count.
db8b50ba · George Anzinger · Linus Torvalds · 307d1849 · db8b50ba · db8b50ba
Commit db8b50ba authored Feb 17, 2003 by George Anzinger Committed by Linus Torvalds Feb 17, 2003
23 changed files
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -41,7 +41,6 @@
 */

 #include <linux/config.h>
-#include <linux/sys.h>
 #include <linux/linkage.h>
 #include <asm/thread_info.h>
 #include <asm/errno.h>
@@ -252,7 +251,7 @@ ENTRY(sysenter_entry)
 	pushl %eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	cmpl $(NR_syscalls), %eax
+	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys

 	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
@@ -276,7 +275,7 @@ ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-	cmpl $(NR_syscalls), %eax
+	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 					# system call tracing in operation
 	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
@@ -339,7 +338,7 @@ syscall_trace_entry:
 	xorl %edx,%edx
 	call do_syscall_trace
 	movl ORIG_EAX(%esp), %eax
-	cmpl $(NR_syscalls), %eax
+	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit

@@ -801,8 +800,15 @@ ENTRY(sys_call_table)
 	.long sys_epoll_wait
 	.long sys_remap_file_pages
 	.long sys_set_tid_address
-
-
-	.rept NR_syscalls-(.-sys_call_table)/4
-		.long sys_ni_syscall
-	.endr
+ 	.long sys_timer_create
+ 	.long sys_timer_settime		/* 260 */
+ 	.long sys_timer_gettime
+ 	.long sys_timer_getoverrun
+ 	.long sys_timer_delete
+ 	.long sys_clock_settime
+ 	.long sys_clock_gettime		/* 265 */
+ 	.long sys_clock_getres
+ 	.long sys_clock_nanosleep
+ 
+ 
+nr_syscalls=(.-sys_call_table)/4
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -135,6 +135,7 @@ void do_settimeofday(struct timeval *tv)
 	time_maxerror = NTP_PHASE_LIMIT;
 	time_esterror = NTP_PHASE_LIMIT;
 	write_sequnlock_irq(&xtime_lock);
+	clock_was_set();
 }

 /*

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -823,6 +823,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 			
 	flush_signal_handlers(current);
 	flush_old_files(current->files);
+	exit_itimers(current);

 	return 0;


--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -43,8 +43,11 @@ typedef struct siginfo {

 		/* POSIX.1b timers */
 		struct {
-			unsigned int _timer1;
-			unsigned int _timer2;
+			timer_t _tid;		/* timer id */
+			int _overrun;		/* overrun count */
+			char _pad[sizeof( __ARCH_SI_UID_T) - sizeof(int)];
+			sigval_t _sigval;	/* same as below */
+			int _sys_private;       /* not to be passed to user */
 		} _timer;

 		/* POSIX.1b signals */
@@ -86,8 +89,9 @@ typedef struct siginfo {
 */
 #define si_pid		_sifields._kill._pid
 #define si_uid		_sifields._kill._uid
-#define si_timer1	_sifields._timer._timer1
-#define si_timer2	_sifields._timer._timer2
+#define si_tid		_sifields._timer._tid
+#define si_overrun	_sifields._timer._overrun
+#define si_sys_private  _sifields._timer._sys_private
 #define si_status	_sifields._sigchld._status
 #define si_utime	_sifields._sigchld._utime
 #define si_stime	_sifields._sigchld._stime
@@ -221,6 +225,7 @@ typedef struct siginfo {
 #define SIGEV_SIGNAL	0	/* notify via signal */
 #define SIGEV_NONE	1	/* other notification: meaningless */
 #define SIGEV_THREAD	2	/* deliver via thread creation */
+#define SIGEV_THREAD_ID 4	/* deliver to thread */

 #define SIGEV_MAX_SIZE	64
 #ifndef SIGEV_PAD_SIZE
@@ -235,6 +240,7 @@ typedef struct sigevent {
 	int sigev_notify;
 	union {
 		int _pad[SIGEV_PAD_SIZE];
+		 int _tid;

 		struct {
 			void (*_function)(sigval_t);
@@ -247,10 +253,12 @@ typedef struct sigevent {

 #define sigev_notify_function	_sigev_un._sigev_thread._function
 #define sigev_notify_attributes	_sigev_un._sigev_thread._attribute
+#define sigev_notify_thread_id	 _sigev_un._tid

 #ifdef __KERNEL__

 struct siginfo;
+void do_schedule_next_timer(struct siginfo *info);

 #ifndef HAVE_ARCH_COPY_SIGINFO


--- a/include/asm-i386/posix_types.h
+++ b/include/asm-i386/posix_types.h
@@ -22,6 +22,8 @@ typedef int		__kernel_ptrdiff_t;
 typedef long		__kernel_time_t;
 typedef long		__kernel_suseconds_t;
 typedef long		__kernel_clock_t;
+typedef int		__kernel_timer_t;
+typedef int		__kernel_clockid_t;
 typedef int		__kernel_daddr_t;
 typedef char *		__kernel_caddr_t;
 typedef unsigned short	__kernel_uid16_t;

--- a/include/asm-i386/signal.h
+++ b/include/asm-i386/signal.h
@@ -3,6 +3,7 @@

 #include <linux/types.h>
 #include <linux/linkage.h>
+#include <linux/time.h>

 /* Avoid too many header ordering problems.  */
 struct siginfo;

--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -264,7 +264,17 @@
 #define __NR_epoll_wait		256
 #define __NR_remap_file_pages	257
 #define __NR_set_tid_address	258
+#define __NR_timer_create	259
+#define __NR_timer_settime	(__NR_timer_create+1)
+#define __NR_timer_gettime	(__NR_timer_create+2)
+#define __NR_timer_getoverrun	(__NR_timer_create+3)
+#define __NR_timer_delete	(__NR_timer_create+4)
+#define __NR_clock_settime	(__NR_timer_create+5)
+#define __NR_clock_gettime	(__NR_timer_create+6)
+#define __NR_clock_getres	(__NR_timer_create+7)
+#define __NR_clock_nanosleep	(__NR_timer_create+8)

+#define NR_syscalls 268

 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */


--- a/include/linux/idr.h
+++ b/include/linux/idr.h
+/*
+ * include/linux/id.h
+ * 
+ * 2002-10-18  written by Jim Houston jim.houston@ccur.com
+ *	Copyright (C) 2002 by Concurrent Computer Corporation
+ *	Distributed under the GNU GPL license version 2.
+ *
+ * Small id to pointer translation service avoiding fixed sized
+ * tables.
+ */
+#include <linux/types.h>
+#include <asm/bitops.h>
+
+#define RESERVED_ID_BITS 8
+
+#if     BITS_PER_LONG == 32
+#define IDR_BITS 5
+#define IDR_FULL 0xffffffff
+#elif BITS_PER_LONG == 64
+#define IDR_BITS 6
+#define IDR_FULL 0xffffffffffffffff
+#else
+#error "BITS_PER_LONG is not 32 or 64"
+#endif
+
+#define IDR_MASK ((1 << IDR_BITS)-1)
+
+/* Leave the possibility of an incomplete final layer */
+#define MAX_LEVEL (BITS_PER_LONG - RESERVED_ID_BITS + IDR_BITS - 1) / IDR_BITS
+#define MAX_ID_SHIFT (BITS_PER_LONG - RESERVED_ID_BITS)
+#define MAX_ID_BIT (1 << MAX_ID_SHIFT)
+#define MAX_ID_MASK (MAX_ID_BIT - 1)
+
+/* Number of id_layer structs to leave in free list */
+#define IDR_FREE_MAX MAX_LEVEL + MAX_LEVEL
+
+struct idr_layer {
+	unsigned long	        bitmap;     // A zero bit means "space here"
+	int                     count;      // When zero, we can release it
+	struct idr_layer       *ary[1<<IDR_BITS];
+};
+
+struct idr {
+	struct idr_layer *top;
+	int		  layers;
+	int		  count;
+	struct idr_layer *id_free;
+	int               id_free_cnt;
+	spinlock_t        lock;
+};
+
+/*
+ * This is what we export.
+ */
+
+void *idr_find(struct idr *idp, int id);
+int idr_pre_get(struct idr *idp);
+int idr_get_new(struct idr *idp, void *ptr);
+void idr_remove(struct idr *idp, int id);
+void idr_init(struct idr *idp);
+
+extern kmem_cache_t *idr_layer_cache;
+
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -98,6 +98,7 @@
 	.sighand	= &init_sighand,				\
 	.pending	= { NULL, &tsk.pending.head, {{0}}},		\
 	.blocked	= {{0}},					\
+	 .posix_timers	 = LIST_HEAD_INIT(tsk.posix_timers),		   \
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\

--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
+#ifndef _linux_POSIX_TIMERS_H
+#define _linux_POSIX_TIMERS_H
+
+struct k_clock {
+	int res;		/* in nano seconds */
+	int (*clock_set) (struct timespec * tp);
+	int (*clock_get) (struct timespec * tp);
+	int (*nsleep) (int flags,
+		       struct timespec * new_setting,
+		       struct itimerspec * old_setting);
+	int (*timer_set) (struct k_itimer * timr, int flags,
+			  struct itimerspec * new_setting,
+			  struct itimerspec * old_setting);
+	int (*timer_del) (struct k_itimer * timr);
+	void (*timer_get) (struct k_itimer * timr,
+			   struct itimerspec * cur_setting);
+};
+struct now_struct {
+	unsigned long jiffies;
+};
+
+#define posix_get_now(now) (now)->jiffies = jiffies;
+#define posix_time_before(timer, now) \
+                      time_before((timer)->expires, (now)->jiffies)
+
+#define posix_bump_timer(timr) do { \
+                        (timr)->it_timer.expires += (timr)->it_incr; \
+                        (timr)->it_overrun++;               \
+                       }while (0)
+#endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -295,6 +295,25 @@ extern struct user_struct root_user;
 typedef struct prio_array prio_array_t;
 struct backing_dev_info;

+/* POSIX.1b interval timer structure. */
+struct k_itimer {
+	struct list_head list;		 /* free/ allocate list */
+	spinlock_t it_lock;
+	clockid_t it_clock;		/* which timer type */
+	timer_t it_id;			/* timer id */
+	int it_overrun;			/* overrun on pending signal  */
+	int it_overrun_last;		 /* overrun on last delivered signal */
+	int it_requeue_pending;          /* waiting to requeue this timer */
+	int it_sigev_notify;		 /* notify word of sigevent struct */
+	int it_sigev_signo;		 /* signo word of sigevent struct */
+	sigval_t it_sigev_value;	 /* value word of sigevent struct */
+	unsigned long it_incr;		/* interval specified in jiffies */
+	struct task_struct *it_process;	/* process to send signal to */
+	struct timer_list it_timer;
+};
+
+
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -358,6 +377,7 @@ struct task_struct {
 	unsigned long it_real_value, it_prof_value, it_virt_value;
 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
 	struct timer_list real_timer;
+	struct list_head posix_timers; /* POSIX.1b Interval Timers */
 	unsigned long utime, stime, cutime, cstime;
 	u64 start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
@@ -597,6 +617,7 @@ extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
+extern void exit_itimers(struct task_struct *);

 extern NORET_TYPE void do_group_exit(int);


--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -208,7 +208,7 @@ extern int sigprocmask(int, sigset_t *, sigset_t *);
 struct pt_regs;
 extern int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs, void *cookie);
 #endif
-
+#define FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP
 #endif /* __KERNEL__ */

 #endif /* _LINUX_SIGNAL_H */
--- a/include/linux/sys.h
+++ b/include/linux/sys.h
@@ -2,9 +2,8 @@
 #define _LINUX_SYS_H

 /*
- * system call entry points ... but not all are defined
+ * This file is no longer used or needed
 */
-#define NR_syscalls 260

 /*
 * These are system calls that will be removed at some time

--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -41,6 +41,19 @@ struct timezone {
 */
 #define MAX_JIFFY_OFFSET ((~0UL >> 1)-1)

+/* Parameters used to convert the timespec values */
+#ifndef USEC_PER_SEC
+#define USEC_PER_SEC (1000000L)
+#endif
+
+#ifndef NSEC_PER_SEC
+#define NSEC_PER_SEC (1000000000L)
+#endif
+
+#ifndef NSEC_PER_USEC
+#define NSEC_PER_USEC (1000L)
+#endif
+
 static __inline__ unsigned long
 timespec_to_jiffies(struct timespec *value)
 {
@@ -139,6 +152,8 @@ struct timespec current_kernel_time(void);
 #ifdef __KERNEL__
 extern void do_gettimeofday(struct timeval *tv);
 extern void do_settimeofday(struct timeval *tv);
+extern int do_sys_settimeofday(struct timeval *tv, struct timezone *tz);
+extern void clock_was_set(void); // call when ever the clock is set
 extern long do_nanosleep(struct timespec *t);
 extern long do_utimes(char * filename, struct timeval * times);
 #endif
@@ -167,4 +182,24 @@ struct	itimerval {
 	struct	timeval it_value;	/* current value */
 };

+
+/*
+ * The IDs of the various system clocks (for POSIX.1b interval timers).
+ */
+#define CLOCK_REALTIME		  0
+#define CLOCK_MONOTONIC	  1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID	 3
+#define CLOCK_REALTIME_HR	 4
+#define CLOCK_MONOTONIC_HR	  5
+
+#define MAX_CLOCKS 6
+
+/*
+ * The various flags for setting POSIX.1b interval timers.
+ */
+
+#define TIMER_ABSTIME 0x01
+
+
 #endif
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -27,6 +27,8 @@ typedef __kernel_pid_t		pid_t;
 typedef __kernel_daddr_t	daddr_t;
 typedef __kernel_key_t		key_t;
 typedef __kernel_suseconds_t	suseconds_t;
+typedef __kernel_timer_t	timer_t;
+typedef __kernel_clockid_t	clockid_t;

 #ifdef __KERNEL__
 typedef __kernel_uid32_t	uid_t;

--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,7 +6,7 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o futex.o pid.o \
-	    rcupdate.o intermodule.o extable.o params.o
+	    rcupdate.o intermodule.o extable.o params.o posix-timers.o

 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o

--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -460,6 +460,7 @@ void end_lazy_tlb(struct mm_struct *mm)
 	mmdrop(active_mm);
 }

+
 /*
 * Turn us into a lazy TLB process if we
 * aren't already..
@@ -741,6 +742,7 @@ NORET_TYPE void do_exit(long code)
 	__exit_files(tsk);
 	__exit_fs(tsk);
 	exit_namespace(tsk);
+	exit_itimers(tsk);
 	exit_thread();

 	if (tsk->leader)

--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -813,6 +813,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,

 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
+	INIT_LIST_HEAD(&p->posix_timers);
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);

--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
+/*
+ * linux/kernel/posix_timers.c
+ *
+ * 
+ * 2002-10-15  Posix Clocks & timers by George Anzinger
+ *			     Copyright (C) 2002 by MontaVista Software.
+ */
+
+/* These are all the functions necessary to implement 
+ * POSIX clocks & timers
+ */
+
+#include <linux/mm.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/idr.h>
+#include <linux/posix-timers.h>
+
+#ifndef div_long_long_rem
+#include <asm/div64.h>
+
+#define div_long_long_rem(dividend,divisor,remainder) ({ \
+		       u64 result = dividend;		\
+		       *remainder = do_div(result,divisor); \
+		       result; })
+
+#endif				/* ifndef div_long_long_rem */
+
+/*
+ * Management arrays for POSIX timers.	 Timers are kept in slab memory
+ * Timer ids are allocated by an external routine that keeps track of the
+ * id and the timer.  The external interface is:
+ *
+ *void *idr_find(struct idr *idp, int id);           to find timer_id <id>
+ *int idr_get_new(struct idr *idp, void *ptr);       to get a new id and 
+ *                                                  related it to <ptr>
+ *void idr_remove(struct idr *idp, int id);          to release <id>
+ *void idr_init(struct idr *idp);                    to initialize <idp>
+ *                                                  which we supply.
+ * The idr_get_new *may* call slab for more memory so it must not be
+ * called under a spin lock.  Likewise idr_remore may release memory
+ * (but it may be ok to do this under a lock...).
+ * idr_find is just a memory look up and is quite fast.  A zero return
+ * indicates that the requested id does not exist.
+
+ */
+/*
+   * Lets keep our timers in a slab cache :-)
+ */
+static kmem_cache_t *posix_timers_cache;
+struct idr posix_timers_id;
+spinlock_t idr_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * Just because the timer is not in the timer list does NOT mean it is
+ * inactive.  It could be in the "fire" routine getting a new expire time.
+ */
+#define TIMER_INACTIVE 1
+#define TIMER_RETRY 1
+#ifdef CONFIG_SMP
+#define timer_active(tmr) (tmr->it_timer.entry.prev != (void *)TIMER_INACTIVE)
+#define set_timer_inactive(tmr) tmr->it_timer.entry.prev = (void *)TIMER_INACTIVE
+#else
+#define timer_active(tmr) BARFY	// error to use outside of SMP
+#define set_timer_inactive(tmr)
+#endif
+/*
+ * The timer ID is turned into a timer address by idr_find().
+ * Verifying a valid ID consists of:
+ * 
+ * a) checking that idr_find() returns other than zero.
+ * b) checking that the timer id matches the one in the timer itself.
+ * c) that the timer owner is in the callers thread group.
+ */
+
+
+/* 
+ * CLOCKs: The POSIX standard calls for a couple of clocks and allows us
+ *	    to implement others.  This structure defines the various
+ *	    clocks and allows the possibility of adding others.	 We
+ *	    provide an interface to add clocks to the table and expect
+ *	    the "arch" code to add at least one clock that is high
+ *	    resolution.	 Here we define the standard CLOCK_REALTIME as a
+ *	    1/HZ resolution clock.
+
+ * CPUTIME & THREAD_CPUTIME: We are not, at this time, definding these
+ *	    two clocks (and the other process related clocks (Std
+ *	    1003.1d-1999).  The way these should be supported, we think,
+ *	    is to use large negative numbers for the two clocks that are
+ *	    pinned to the executing process and to use -pid for clocks
+ *	    pinned to particular pids.	Calls which supported these clock
+ *	    ids would split early in the function.
+ 
+ * RESOLUTION: Clock resolution is used to round up timer and interval
+ *	    times, NOT to report clock times, which are reported with as
+ *	    much resolution as the system can muster.  In some cases this
+ *	    resolution may depend on the underlaying clock hardware and
+ *	    may not be quantifiable until run time, and only then is the
+ *	    necessary code is written.	The standard says we should say
+ *	    something about this issue in the documentation...
+
+ * FUNCTIONS: The CLOCKs structure defines possible functions to handle
+ *	    various clock functions.  For clocks that use the standard
+ *	    system timer code these entries should be NULL.  This will
+ *	    allow dispatch without the overhead of indirect function
+ *	    calls.  CLOCKS that depend on other sources (e.g. WWV or GPS)
+ *	    must supply functions here, even if the function just returns
+ *	    ENOSYS.  The standard POSIX timer management code assumes the
+ *	    following: 1.) The k_itimer struct (sched.h) is used for the
+ *	    timer.  2.) The list, it_lock, it_clock, it_id and it_process
+ *	    fields are not modified by timer code. 
+ *
+ *          At this time all functions EXCEPT clock_nanosleep can be
+ *          redirected by the CLOCKS structure.  Clock_nanosleep is in
+ *          there, but the code ignors it.
+ *
+ * Permissions: It is assumed that the clock_settime() function defined
+ *	    for each clock will take care of permission checks.	 Some
+ *	    clocks may be set able by any user (i.e. local process
+ *	    clocks) others not.	 Currently the only set able clock we
+ *	    have is CLOCK_REALTIME and its high res counter part, both of
+ *	    which we beg off on and pass to do_sys_settimeofday().
+ */
+
+struct k_clock posix_clocks[MAX_CLOCKS];
+
+#define if_clock_do(clock_fun, alt_fun,parms)	(! clock_fun)? alt_fun parms :\
+							      clock_fun parms
+
+#define p_timer_get( clock,a,b) if_clock_do((clock)->timer_get, \
+					     do_timer_gettime,	 \
+					     (a,b))
+
+#define p_nsleep( clock,a,b,c) if_clock_do((clock)->nsleep,   \
+					    do_nsleep,	       \
+					    (a,b,c))
+
+#define p_timer_del( clock,a) if_clock_do((clock)->timer_del, \
+					   do_timer_delete,    \
+					   (a))
+
+void register_posix_clock(int clock_id, struct k_clock *new_clock);
+
+static int do_posix_gettime(struct k_clock *clock, struct timespec *tp);
+
+int do_posix_clock_monotonic_gettime(struct timespec *tp);
+
+int do_posix_clock_monotonic_settime(struct timespec *tp);
+static struct k_itimer *lock_timer(timer_t timer_id, long *flags);
+static inline void unlock_timer(struct k_itimer *timr, long flags);
+
+/* 
+ * Initialize everything, well, just everything in Posix clocks/timers ;)
+ */
+
+static __init int
+init_posix_timers(void)
+{
+	struct k_clock clock_realtime = {.res = NSEC_PER_SEC / HZ };
+	struct k_clock clock_monotonic = {.res = NSEC_PER_SEC / HZ,
+		.clock_get = do_posix_clock_monotonic_gettime,
+		.clock_set = do_posix_clock_monotonic_settime
+	};
+
+	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
+	register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
+
+	posix_timers_cache = kmem_cache_create("posix_timers_cache",
+					       sizeof (struct k_itimer), 0, 0,
+					       0, 0);
+	idr_init(&posix_timers_id);
+	return 0;
+}
+
+__initcall(init_posix_timers);
+
+static inline int
+tstojiffie(struct timespec *tp, int res, unsigned long *jiff)
+{
+	unsigned long sec = tp->tv_sec;
+	long nsec = tp->tv_nsec + res - 1;
+
+	if (nsec > NSEC_PER_SEC) {
+		sec++;
+		nsec -= NSEC_PER_SEC;
+	}
+
+	/*
+	 * A note on jiffy overflow: It is possible for the system to
+	 * have been up long enough for the jiffies quanity to overflow.
+	 * In order for correct timer evaluations we require that the
+	 * specified time be somewhere between now and now + (max
+	 * unsigned int/2).  Times beyond this will be truncated back to
+	 * this value.   This is done in the absolute adjustment code,
+	 * below.  Here it is enough to just discard the high order
+	 * bits.  
+	 */
+	*jiff = HZ * sec;
+	/*
+	 * Do the res thing. (Don't forget the add in the declaration of nsec) 
+	 */
+	nsec -= nsec % res;
+	/*
+	 * Split to jiffie and sub jiffie
+	 */
+	*jiff += nsec / (NSEC_PER_SEC / HZ);
+	/*
+	 * We trust that the optimizer will use the remainder from the 
+	 * above div in the following operation as long as they are close. 
+	 */
+	return 0;
+}
+static void
+tstotimer(struct itimerspec *time, struct k_itimer *timer)
+{
+	int res = posix_clocks[timer->it_clock].res;
+	tstojiffie(&time->it_value, res, &timer->it_timer.expires);
+	tstojiffie(&time->it_interval, res, &timer->it_incr);
+}
+
+static void
+schedule_next_timer(struct k_itimer *timr)
+{
+	struct now_struct now;
+
+	/* Set up the timer for the next interval (if there is one) */
+	if (timr->it_incr == 0) {
+		{
+			set_timer_inactive(timr);
+			return;
+		}
+	}
+	posix_get_now(&now);
+	while (posix_time_before(&timr->it_timer, &now)) {
+		posix_bump_timer(timr);
+	};
+	timr->it_overrun_last = timr->it_overrun;
+	timr->it_overrun = -1;
+	timr->it_requeue_pending = 0;
+	add_timer(&timr->it_timer);
+}
+
+/*
+
+ * This function is exported for use by the signal deliver code.  It is
+ * called just prior to the info block being released and passes that
+ * block to us.  It's function is to update the overrun entry AND to
+ * restart the timer.  It should only be called if the timer is to be
+ * restarted (i.e. we have flagged this in the sys_private entry of the
+ * info block).
+ *
+ * To protect aginst the timer going away while the interrupt is queued,
+ * we require that the it_requeue_pending flag be set.
+
+ */
+void
+do_schedule_next_timer(struct siginfo *info)
+{
+
+	struct k_itimer *timr;
+	long flags;
+
+	timr = lock_timer(info->si_tid, &flags);
+
+	if (!timr || !timr->it_requeue_pending)
+		goto exit;
+
+	schedule_next_timer(timr);
+	info->si_overrun = timr->it_overrun_last;
+      exit:
+	if (timr)
+		unlock_timer(timr, flags);
+}
+
+/* 
+
+ * Notify the task and set up the timer for the next expiration (if
+ * applicable).  This function requires that the k_itimer structure
+ * it_lock is taken.  This code will requeue the timer only if we get
+ * either an error return or a flag (ret > 0) from send_seg_info
+ * indicating that the signal was either not queued or was queued
+ * without an info block.  In this case, we will not get a call back to
+ * do_schedule_next_timer() so we do it here.  This should be rare...
+
+ */
+
+static void
+timer_notify_task(struct k_itimer *timr)
+{
+	struct siginfo info;
+	int ret;
+
+	memset(&info, 0, sizeof (info));
+
+	/* Send signal to the process that owns this timer. */
+	info.si_signo = timr->it_sigev_signo;
+	info.si_errno = 0;
+	info.si_code = SI_TIMER;
+	info.si_tid = timr->it_id;
+	info.si_value = timr->it_sigev_value;
+	if (timr->it_incr == 0) {
+		set_timer_inactive(timr);
+	} else {
+		timr->it_requeue_pending = info.si_sys_private = 1;
+	}
+	ret = send_sig_info(info.si_signo, &info, timr->it_process);
+	switch (ret) {
+
+	default:
+		/*
+		 * Signal was not sent.  May or may not need to
+		 * restart the timer.
+		 */
+		printk(KERN_WARNING "sending signal failed: %d\n", ret);
+	case 1:
+		/*
+		 * signal was not sent because of sig_ignor or,
+		 * possibly no queue memory OR will be sent but,
+		 * we will not get a call back to restart it AND
+		 * it should be restarted. 
+		 */
+		schedule_next_timer(timr);
+	case 0:
+		/* 
+		 * all's well new signal queued
+		 */
+		break;
+	}
+}
+
+/*
+
+ * This function gets called when a POSIX.1b interval timer expires.  It
+ * is used as a callback from the kernel internal timer.  The
+ * run_timer_list code ALWAYS calls with interrutps on.
+
+ */
+static void
+posix_timer_fn(unsigned long __data)
+{
+	struct k_itimer *timr = (struct k_itimer *) __data;
+	long flags;
+
+	spin_lock_irqsave(&timr->it_lock, flags);
+	timer_notify_task(timr);
+	unlock_timer(timr, flags);
+}
+
+/*
+ * For some reason mips/mips64 define the SIGEV constants plus 128.  
+ * Here we define a mask to get rid of the common bits.	 The 
+ * optimizer should make this costless to all but mips.
+ */
+#if (ARCH == mips) || (ARCH == mips64)
+#define MIPS_SIGEV ~(SIGEV_NONE & \
+		      SIGEV_SIGNAL & \
+		      SIGEV_THREAD &  \
+		      SIGEV_THREAD_ID)
+#else
+#define MIPS_SIGEV (int)-1
+#endif
+
+static inline struct task_struct *
+good_sigevent(sigevent_t * event)
+{
+	struct task_struct *rtn = current;
+
+	if (event->sigev_notify & SIGEV_THREAD_ID & MIPS_SIGEV) {
+		if (!(rtn =
+		      find_task_by_pid(event->sigev_notify_thread_id)) ||
+		    rtn->tgid != current->tgid) {
+			return NULL;
+		}
+	}
+	if (event->sigev_notify & SIGEV_SIGNAL & MIPS_SIGEV) {
+		if ((unsigned) (event->sigev_signo > SIGRTMAX))
+			return NULL;
+	}
+	if (event->sigev_notify & ~(SIGEV_SIGNAL | SIGEV_THREAD_ID)) {
+		return NULL;
+	}
+	return rtn;
+}
+
+void
+register_posix_clock(int clock_id, struct k_clock *new_clock)
+{
+	if ((unsigned) clock_id >= MAX_CLOCKS) {
+		printk("POSIX clock register failed for clock_id %d\n",
+		       clock_id);
+		return;
+	}
+	posix_clocks[clock_id] = *new_clock;
+}
+
+static struct k_itimer *
+alloc_posix_timer(void)
+{
+	struct k_itimer *tmr;
+	tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL);
+	memset(tmr, 0, sizeof (struct k_itimer));
+	return (tmr);
+}
+
+static void
+release_posix_timer(struct k_itimer *tmr)
+{
+	if (tmr->it_id != -1){
+		spin_lock_irq(&idr_lock);
+		idr_remove(&posix_timers_id, tmr->it_id);
+		spin_unlock_irq(&idr_lock);
+	}
+	kmem_cache_free(posix_timers_cache, tmr);
+}
+
+/* Create a POSIX.1b interval timer. */
+
+asmlinkage int
+sys_timer_create(clockid_t which_clock,
+		 struct sigevent *timer_event_spec, timer_t * created_timer_id)
+{
+	int error = 0;
+	struct k_itimer *new_timer = NULL;
+	timer_t new_timer_id;
+	struct task_struct *process = 0;
+	sigevent_t event;
+
+	if ((unsigned) which_clock >= MAX_CLOCKS ||
+	    !posix_clocks[which_clock].res) return -EINVAL;
+
+	new_timer = alloc_posix_timer();
+	if (unlikely (new_timer == NULL))
+		return -EAGAIN;
+
+	spin_lock_init(&new_timer->it_lock);
+	do {
+		if ( unlikely ( !idr_pre_get(&posix_timers_id))){
+			error = -EAGAIN;
+			new_timer_id = (timer_t)-1;
+			goto out;			
+		}
+		spin_lock_irq(&idr_lock);
+		new_timer_id = (timer_t) idr_get_new(
+			&posix_timers_id, (void *) new_timer);
+		spin_unlock_irq(&idr_lock);
+	}while( unlikely (new_timer_id == -1));
+
+	new_timer->it_id = new_timer_id;
+	/*
+	 * return the timer_id now.  The next step is hard to 
+	 * back out if there is an error.
+	 */
+	if (copy_to_user(created_timer_id,
+			 &new_timer_id, sizeof (new_timer_id))) {
+		error = -EFAULT;
+		goto out;
+	}
+	if (timer_event_spec) {
+		if (copy_from_user(&event, timer_event_spec, sizeof (event))) {
+			error = -EFAULT;
+			goto out;
+		}
+		read_lock(&tasklist_lock);
+		if ((process = good_sigevent(&event))) {
+			/*
+
+			 * We may be setting up this process for another
+			 * thread.  It may be exitiing.  To catch this
+			 * case the we check the PF_EXITING flag.  If
+			 * the flag is not set, the task_lock will catch
+			 * him before it is too late (in exit_itimers).
+
+			 * The exec case is a bit more invloved but easy
+			 * to code.  If the process is in our thread
+			 * group (and it must be or we would not allow
+			 * it here) and is doing an exec, it will cause
+			 * us to be killed.  In this case it will wait
+			 * for us to die which means we can finish this
+			 * linkage with our last gasp. I.e. no code :)
+
+			 */
+			task_lock(process);
+			if (!(process->flags & PF_EXITING)) {
+				list_add(&new_timer->list,
+					 &process->posix_timers);
+				task_unlock(process);
+			} else {
+				task_unlock(process);
+				process = 0;
+			}
+		}
+		read_unlock(&tasklist_lock);
+		if (!process) {
+			error = -EINVAL;
+			goto out;
+		}
+		new_timer->it_sigev_notify = event.sigev_notify;
+		new_timer->it_sigev_signo = event.sigev_signo;
+		new_timer->it_sigev_value = event.sigev_value;
+	} else {
+		new_timer->it_sigev_notify = SIGEV_SIGNAL;
+		new_timer->it_sigev_signo = SIGALRM;
+		new_timer->it_sigev_value.sival_int = new_timer->it_id;
+		process = current;
+		task_lock(process);
+		list_add(&new_timer->list, &process->posix_timers);
+		task_unlock(process);
+	}
+
+	new_timer->it_clock = which_clock;
+	new_timer->it_incr = 0;
+	new_timer->it_overrun = -1;
+	init_timer(&new_timer->it_timer);
+	new_timer->it_timer.expires = 0;
+	new_timer->it_timer.data = (unsigned long) new_timer;
+	new_timer->it_timer.function = posix_timer_fn;
+	set_timer_inactive(new_timer);
+
+	/*
+	 * Once we set the process, it can be found so do it last...
+	 */
+	new_timer->it_process = process;
+
+      out:
+	if (error) {
+		release_posix_timer(new_timer);
+	}
+	return error;
+}
+
+/*
+ * good_timespec
+ *
+ * This function checks the elements of a timespec structure.
+ *
+ * Arguments:
+ * ts	     : Pointer to the timespec structure to check
+ *
+ * Return value: 
+ * If a NULL pointer was passed in, or the tv_nsec field was less than 0
+ * or greater than NSEC_PER_SEC, or the tv_sec field was less than 0,
+ * this function returns 0. Otherwise it returns 1.
+
+ */
+
+static int
+good_timespec(const struct timespec *ts)
+{
+	if ((ts == NULL) ||
+	    (ts->tv_sec < 0) ||
+	    ((unsigned) ts->tv_nsec >= NSEC_PER_SEC)) return 0;
+	return 1;
+}
+
+static inline void
+unlock_timer(struct k_itimer *timr, long flags)
+{
+	spin_unlock_irqrestore(&timr->it_lock, flags);
+}
+
+/*
+
+ * Locking issues: We need to protect the result of the id look up until
+ * we get the timer locked down so it is not deleted under us.  The
+ * removal is done under the idr spinlock so we use that here to bridge
+ * the find to the timer lock.  To avoid a dead lock, the timer id MUST
+ * be release with out holding the timer lock.
+
+ */
+static struct k_itimer *
+lock_timer(timer_t timer_id, long *flags)
+{
+	struct k_itimer *timr;
+	/*
+	 * Watch out here.  We do a irqsave on the idr_lock and pass the 
+	 * flags part over to the timer lock.  Must not let interrupts in
+	 * while we are moving the lock.
+	 */
+
+	spin_lock_irqsave(&idr_lock, *flags);
+	timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id);
+	if (timr) {
+		spin_lock(&timr->it_lock);
+		spin_unlock(&idr_lock);
+
+		if ( (timr->it_id != timer_id) || !(timr->it_process) ||
+		     timr->it_process->tgid != current->tgid) {
+			unlock_timer(timr, *flags);
+			timr = NULL;
+		}
+	} else {
+		spin_unlock_irqrestore(&idr_lock, *flags);
+	}
+
+	return timr;
+}
+
+/* 
+
+ * Get the time remaining on a POSIX.1b interval timer.  This function
+ * is ALWAYS called with spin_lock_irq on the timer, thus it must not
+ * mess with irq.
+
+ * We have a couple of messes to clean up here.  First there is the case
+ * of a timer that has a requeue pending.  These timers should appear to
+ * be in the timer list with an expiry as if we were to requeue them
+ * now.
+
+ * The second issue is the SIGEV_NONE timer which may be active but is
+ * not really ever put in the timer list (to save system resources).
+ * This timer may be expired, and if so, we will do it here.  Otherwise
+ * it is the same as a requeue pending timer WRT to what we should
+ * report.
+
+ */
+void inline
+do_timer_gettime(struct k_itimer *timr, struct itimerspec *cur_setting)
+{
+	long sub_expires;
+	unsigned long expires;
+	struct now_struct now;
+
+	do {
+		expires = timr->it_timer.expires;
+	} while ((volatile long) (timr->it_timer.expires) != expires);
+
+	posix_get_now(&now);
+
+	if (expires && (timr->it_sigev_notify & SIGEV_NONE) && !timr->it_incr) {
+		if (posix_time_before(&timr->it_timer, &now)) {
+			timr->it_timer.expires = expires = 0;
+		}
+	}
+	if (expires) {
+		if (timr->it_requeue_pending ||
+		    (timr->it_sigev_notify & SIGEV_NONE)) {
+			while (posix_time_before(&timr->it_timer, &now)) {
+				posix_bump_timer(timr);
+			};
+		} else {
+			if (!timer_pending(&timr->it_timer)) {
+				sub_expires = expires = 0;
+			}
+		}
+		if (expires) {
+			expires -= now.jiffies;
+		}
+	}
+	jiffies_to_timespec(expires, &cur_setting->it_value);
+	jiffies_to_timespec(timr->it_incr, &cur_setting->it_interval);
+
+	if (cur_setting->it_value.tv_sec < 0) {
+		cur_setting->it_value.tv_nsec = 1;
+		cur_setting->it_value.tv_sec = 0;
+	}
+}
+/* Get the time remaining on a POSIX.1b interval timer. */
+asmlinkage int
+sys_timer_gettime(timer_t timer_id, struct itimerspec *setting)
+{
+	struct k_itimer *timr;
+	struct itimerspec cur_setting;
+	long flags;
+
+	timr = lock_timer(timer_id, &flags);
+	if (!timr)
+		return -EINVAL;
+
+	p_timer_get(&posix_clocks[timr->it_clock], timr, &cur_setting);
+
+	unlock_timer(timr, flags);
+
+	if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+		return -EFAULT;
+
+	return 0;
+}
+/*
+
+ * Get the number of overruns of a POSIX.1b interval timer.  This is to
+ * be the overrun of the timer last delivered.  At the same time we are
+ * accumulating overruns on the next timer.  The overrun is frozen when
+ * the signal is delivered, either at the notify time (if the info block
+ * is not queued) or at the actual delivery time (as we are informed by
+ * the call back to do_schedule_next_timer().  So all we need to do is
+ * to pick up the frozen overrun.
+
+ */
+
+asmlinkage int
+sys_timer_getoverrun(timer_t timer_id)
+{
+	struct k_itimer *timr;
+	int overrun;
+	long flags;
+
+	timr = lock_timer(timer_id, &flags);
+	if (!timr)
+		return -EINVAL;
+
+	overrun = timr->it_overrun_last;
+	unlock_timer(timr, flags);
+
+	return overrun;
+}
+/* Adjust for absolute time */
+/*
+ * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
+ * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
+ * what ever clock he is using.
+ *
+ * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
+ * time to it to get the proper time for the timer.
+ */
+static int
+adjust_abs_time(struct k_clock *clock, struct timespec *tp, int abs)
+{
+	struct timespec now;
+	struct timespec oc;
+	do_posix_clock_monotonic_gettime(&now);
+
+	if (abs &&
+	    (posix_clocks[CLOCK_MONOTONIC].clock_get == clock->clock_get)) {
+	} else {
+
+		if (abs) {
+			do_posix_gettime(clock, &oc);
+		} else {
+			oc.tv_nsec = oc.tv_sec = 0;
+		}
+		tp->tv_sec += now.tv_sec - oc.tv_sec;
+		tp->tv_nsec += now.tv_nsec - oc.tv_nsec;
+
+		/* 
+		 * Normalize...
+		 */
+		if ((tp->tv_nsec - NSEC_PER_SEC) >= 0) {
+			tp->tv_nsec -= NSEC_PER_SEC;
+			tp->tv_sec++;
+		}
+		if ((tp->tv_nsec) < 0) {
+			tp->tv_nsec += NSEC_PER_SEC;
+			tp->tv_sec--;
+		}
+	}
+	/*
+	 * Check if the requested time is prior to now (if so set now) or
+	 * is more than the timer code can handle (if so we error out).
+	 * The (unsigned) catches the case of prior to "now" with the same
+	 * test.  Only on failure do we sort out what happened, and then
+	 * we use the (unsigned) to error out negative seconds.
+	 */
+	if ((unsigned) (tp->tv_sec - now.tv_sec) > (MAX_JIFFY_OFFSET / HZ)) {
+		if ((unsigned) tp->tv_sec < now.tv_sec) {
+			tp->tv_sec = now.tv_sec;
+			tp->tv_nsec = now.tv_nsec;
+		} else {
+			// tp->tv_sec = now.tv_sec + (MAX_JIFFY_OFFSET / HZ);
+			/*
+			 * This is a considered response, not exactly in
+			 * line with the standard (in fact it is silent on
+			 * possible overflows).  We assume such a large 
+			 * value is ALMOST always a programming error and
+			 * try not to compound it by setting a really dumb
+			 * value.
+			 */
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/* Set a POSIX.1b interval timer. */
+/* timr->it_lock is taken. */
+static inline int
+do_timer_settime(struct k_itimer *timr, int flags,
+		 struct itimerspec *new_setting, struct itimerspec *old_setting)
+{
+	struct k_clock *clock = &posix_clocks[timr->it_clock];
+
+	if (old_setting) {
+		do_timer_gettime(timr, old_setting);
+	}
+
+	/* disable the timer */
+	timr->it_incr = 0;
+	/* 
+	 * careful here.  If smp we could be in the "fire" routine which will
+	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
+	 */
+#ifdef CONFIG_SMP
+	if (timer_active(timr) && !del_timer(&timr->it_timer)) {
+		/*
+		 * It can only be active if on an other cpu.  Since
+		 * we have cleared the interval stuff above, it should
+		 * clear once we release the spin lock.  Of course once
+		 * we do that anything could happen, including the 
+		 * complete melt down of the timer.  So return with 
+		 * a "retry" exit status.
+		 */
+		return TIMER_RETRY;
+	}
+	set_timer_inactive(timr);
+#else
+	del_timer(&timr->it_timer);
+#endif
+	timr->it_requeue_pending = 0;
+	timr->it_overrun_last = 0;
+	timr->it_overrun = -1;
+	/* 
+	 *switch off the timer when it_value is zero 
+	 */
+	if ((new_setting->it_value.tv_sec == 0) &&
+	    (new_setting->it_value.tv_nsec == 0)) {
+		timr->it_timer.expires = 0;
+		return 0;
+	}
+
+	if ((flags & TIMER_ABSTIME) &&
+	    (clock->clock_get != do_posix_clock_monotonic_gettime)) {
+	}
+	if (adjust_abs_time(clock,
+			    &new_setting->it_value, flags & TIMER_ABSTIME)) {
+		return -EINVAL;
+	}
+	tstotimer(new_setting, timr);
+
+	/*
+	 * For some reason the timer does not fire immediately if expires is
+	 * equal to jiffies, so the timer notify function is called directly.
+	 * We do not even queue SIGEV_NONE timers!
+	 */
+	if (!(timr->it_sigev_notify & SIGEV_NONE)) {
+		if (timr->it_timer.expires == jiffies) {
+			timer_notify_task(timr);
+		} else
+			add_timer(&timr->it_timer);
+	}
+	return 0;
+}
+
+/* Set a POSIX.1b interval timer */
+asmlinkage int
+sys_timer_settime(timer_t timer_id, int flags,
+		  const struct itimerspec *new_setting,
+		  struct itimerspec *old_setting)
+{
+	struct k_itimer *timr;
+	struct itimerspec new_spec, old_spec;
+	int error = 0;
+	long flag;
+	struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+
+	if (new_setting == NULL) {
+		return -EINVAL;
+	}
+
+	if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) {
+		return -EFAULT;
+	}
+
+	if ((!good_timespec(&new_spec.it_interval)) ||
+	    (!good_timespec(&new_spec.it_value))) {
+		return -EINVAL;
+	}
+      retry:
+	timr = lock_timer(timer_id, &flag);
+	if (!timr)
+		return -EINVAL;
+
+	if (!posix_clocks[timr->it_clock].timer_set) {
+		error = do_timer_settime(timr, flags, &new_spec, rtn);
+	} else {
+		error = posix_clocks[timr->it_clock].timer_set(timr,
+							       flags,
+							       &new_spec, rtn);
+	}
+	unlock_timer(timr, flag);
+	if (error == TIMER_RETRY) {
+		rtn = NULL;	// We already got the old time...
+		goto retry;
+	}
+
+	if (old_setting && !error) {
+		if (copy_to_user(old_setting, &old_spec, sizeof (old_spec))) {
+			error = -EFAULT;
+		}
+	}
+
+	return error;
+}
+
+static inline int
+do_timer_delete(struct k_itimer *timer)
+{
+	timer->it_incr = 0;
+#ifdef CONFIG_SMP
+	if (timer_active(timer) &&
+	    !del_timer(&timer->it_timer) && !timer->it_requeue_pending) {
+		/*
+		 * It can only be active if on an other cpu.  Since
+		 * we have cleared the interval stuff above, it should
+		 * clear once we release the spin lock.  Of course once
+		 * we do that anything could happen, including the 
+		 * complete melt down of the timer.  So return with 
+		 * a "retry" exit status.
+		 */
+		return TIMER_RETRY;
+	}
+#else
+	del_timer(&timer->it_timer);
+#endif
+	return 0;
+}
+
+/* Delete a POSIX.1b interval timer. */
+asmlinkage int
+sys_timer_delete(timer_t timer_id)
+{
+	struct k_itimer *timer;
+	long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+      retry_delete:
+#endif
+
+	timer = lock_timer(timer_id, &flags);
+	if (!timer)
+		return -EINVAL;
+
+#ifdef CONFIG_SMP
+	error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+
+	task_lock(timer->it_process);
+
+	list_del(&timer->list);
+
+	task_unlock(timer->it_process);
+
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	timer->it_process = NULL;
+	unlock_timer(timer, flags);
+	release_posix_timer(timer);
+	return 0;
+}
+/*
+ * return  timer owned by the process, used by exit_itimers
+ */
+static inline void
+itimer_delete(struct k_itimer *timer)
+{
+	if (sys_timer_delete(timer->it_id)) {
+		BUG();
+	}
+}
+/*
+ * This is exported to exit and exec
+ */
+void
+exit_itimers(struct task_struct *tsk)
+{
+	struct k_itimer *tmr;
+
+	task_lock(tsk);
+	while (!list_empty(&tsk->posix_timers)) {
+		tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
+		task_unlock(tsk);
+		itimer_delete(tmr);
+		task_lock(tsk);
+	}
+	task_unlock(tsk);
+}
+
+/*
+ * And now for the "clock" calls
+
+ * These functions are called both from timer functions (with the timer
+ * spin_lock_irq() held and from clock calls with no locking.	They must
+ * use the save flags versions of locks.
+ */
+static int
+do_posix_gettime(struct k_clock *clock, struct timespec *tp)
+{
+
+	if (clock->clock_get) {
+		return clock->clock_get(tp);
+	}
+
+	do_gettimeofday((struct timeval *) tp);
+	tp->tv_nsec *= NSEC_PER_USEC;
+	return 0;
+}
+
+/*
+ * We do ticks here to avoid the irq lock ( they take sooo long).
+ * The seqlock is great here.  Since we a reader, we don't really care
+ * if we are interrupted since we don't take lock that will stall us or 
+ * any other cpu. Voila, no irq lock is needed.
+
+ * Note also that the while loop assures that the sub_jiff_offset
+ * will be less than a jiffie, thus no need to normalize the result.
+ * Well, not really, if called with ints off :(
+ */
+
+int
+do_posix_clock_monotonic_gettime(struct timespec *tp)
+{
+	long sub_sec;
+	u64 jiffies_64_f;
+
+#if (BITS_PER_LONG > 32)
+
+	jiffies_64_f = jiffies_64;
+
+#else
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		jiffies_64_f = jiffies_64;
+
+	}while(  read_seqretry(&xtime_lock, seq));
+
+#endif
+	tp->tv_sec = div_long_long_rem(jiffies_64_f, HZ, &sub_sec);
+
+	tp->tv_nsec = sub_sec * (NSEC_PER_SEC / HZ);
+	return 0;
+}
+
+int
+do_posix_clock_monotonic_settime(struct timespec *tp)
+{
+	return -EINVAL;
+}
+
+asmlinkage int
+sys_clock_settime(clockid_t which_clock, const struct timespec *tp)
+{
+	struct timespec new_tp;
+
+	if ((unsigned) which_clock >= MAX_CLOCKS ||
+	    !posix_clocks[which_clock].res) return -EINVAL;
+	if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+		return -EFAULT;
+	if (posix_clocks[which_clock].clock_set) {
+		return posix_clocks[which_clock].clock_set(&new_tp);
+	}
+	new_tp.tv_nsec /= NSEC_PER_USEC;
+	return do_sys_settimeofday((struct timeval *) &new_tp, NULL);
+}
+asmlinkage int
+sys_clock_gettime(clockid_t which_clock, struct timespec *tp)
+{
+	struct timespec rtn_tp;
+	int error = 0;
+
+	if ((unsigned) which_clock >= MAX_CLOCKS ||
+	    !posix_clocks[which_clock].res) return -EINVAL;
+
+	error = do_posix_gettime(&posix_clocks[which_clock], &rtn_tp);
+
+	if (!error) {
+		if (copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+			error = -EFAULT;
+		}
+	}
+	return error;
+
+}
+asmlinkage int
+sys_clock_getres(clockid_t which_clock, struct timespec *tp)
+{
+	struct timespec rtn_tp;
+
+	if ((unsigned) which_clock >= MAX_CLOCKS ||
+	    !posix_clocks[which_clock].res) return -EINVAL;
+
+	rtn_tp.tv_sec = 0;
+	rtn_tp.tv_nsec = posix_clocks[which_clock].res;
+	if (tp) {
+		if (copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+			return -EFAULT;
+		}
+	}
+	return 0;
+
+}
+static void
+nanosleep_wake_up(unsigned long __data)
+{
+	struct task_struct *p = (struct task_struct *) __data;
+
+	wake_up_process(p);
+}
+
+/*
+ * The standard says that an absolute nanosleep call MUST wake up at
+ * the requested time in spite of clock settings.  Here is what we do:
+ * For each nanosleep call that needs it (only absolute and not on 
+ * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
+ * into the "nanosleep_abs_list".  All we need is the task_struct pointer.
+ * When ever the clock is set we just wake up all those tasks.	 The rest
+ * is done by the while loop in clock_nanosleep().
+
+ * On locking, clock_was_set() is called from update_wall_clock which 
+ * holds (or has held for it) a write_lock_irq( xtime_lock) and is 
+ * called from the timer bh code.  Thus we need the irq save locks.
+ */
+spinlock_t nanosleep_abs_list_lock = SPIN_LOCK_UNLOCKED;
+
+struct list_head nanosleep_abs_list = LIST_HEAD_INIT(nanosleep_abs_list);
+
+struct abs_struct {
+	struct list_head list;
+	struct task_struct *t;
+};
+
+void
+clock_was_set(void)
+{
+	struct list_head *pos;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nanosleep_abs_list_lock, flags);
+	list_for_each(pos, &nanosleep_abs_list) {
+		wake_up_process(list_entry(pos, struct abs_struct, list)->t);
+	}
+	spin_unlock_irqrestore(&nanosleep_abs_list_lock, flags);
+}
+
+long clock_nanosleep_restart(struct restart_block *restart_block);
+
+extern long do_clock_nanosleep(clockid_t which_clock, int flags, 
+			       struct timespec *t);
+
+#ifdef FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP
+
+asmlinkage long
+sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
+{
+	struct timespec t;
+	long ret;
+
+	if (copy_from_user(&t, rqtp, sizeof (t)))
+		return -EFAULT;
+
+	if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
+		return -EINVAL;
+
+	ret = do_clock_nanosleep(CLOCK_REALTIME, 0, &t);
+
+	if (ret == -ERESTART_RESTARTBLOCK && rmtp && 
+	    copy_to_user(rmtp, &t, sizeof (t)))
+			return -EFAULT;
+	return ret;
+}
+#endif				// ! FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP
+
+asmlinkage long
+sys_clock_nanosleep(clockid_t which_clock, int flags,
+		    const struct timespec *rqtp, struct timespec *rmtp)
+{
+	struct timespec t;
+	int ret;
+
+	if ((unsigned) which_clock >= MAX_CLOCKS ||
+	    !posix_clocks[which_clock].res) return -EINVAL;
+
+	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+		return -EFAULT;
+
+	if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
+		return -EINVAL;
+
+	ret = do_clock_nanosleep(which_clock, flags, &t);
+
+	if ((ret == -ERESTART_RESTARTBLOCK) && rmtp && 
+	    copy_to_user(rmtp, &t, sizeof (t)))
+			return -EFAULT;
+	return ret;
+
+}
+
+long
+do_clock_nanosleep(clockid_t which_clock, int flags, struct timespec *tsave)
+{
+	struct timespec t;
+	struct timer_list new_timer;
+	struct abs_struct abs_struct = { list:{next:0} };
+	int abs;
+	int rtn = 0;
+	int active;
+	struct restart_block *restart_block =
+	    &current_thread_info()->restart_block;
+
+	init_timer(&new_timer);
+	new_timer.expires = 0;
+	new_timer.data = (unsigned long) current;
+	new_timer.function = nanosleep_wake_up;
+	abs = flags & TIMER_ABSTIME;
+
+	if (restart_block->fn == clock_nanosleep_restart) {
+		/*
+		 * Interrupted by a non-delivered signal, pick up remaining
+		 * time and continue.
+		 */
+		restart_block->fn = do_no_restart_syscall;
+		if (!restart_block->arg2)
+			return -EINTR;
+
+		new_timer.expires = restart_block->arg2;
+		if (time_before(new_timer.expires, jiffies))
+			return 0;
+	}
+
+	if (abs && (posix_clocks[which_clock].clock_get !=
+		    posix_clocks[CLOCK_MONOTONIC].clock_get)) {
+		spin_lock_irq(&nanosleep_abs_list_lock);
+		list_add(&abs_struct.list, &nanosleep_abs_list);
+		abs_struct.t = current;
+		spin_unlock_irq(&nanosleep_abs_list_lock);
+	}
+	do {
+		t = *tsave;
+		if ((abs || !new_timer.expires) &&
+		    !(rtn = adjust_abs_time(&posix_clocks[which_clock],
+					    &t, abs))) {
+			/*
+			 * On error, we don't set up the timer so
+			 * we don't arm the timer so
+			 * del_timer_sync() will return 0, thus
+			 * active is zero... and so it goes.
+			 */
+
+			tstojiffie(&t,
+				   posix_clocks[which_clock].res,
+				   &new_timer.expires);
+		}
+		if (new_timer.expires) {
+			current->state = TASK_INTERRUPTIBLE;
+			add_timer(&new_timer);
+
+			schedule();
+		}
+	}
+	while ((active = del_timer_sync(&new_timer)) &&
+	       !test_thread_flag(TIF_SIGPENDING));
+
+	if (abs_struct.list.next) {
+		spin_lock_irq(&nanosleep_abs_list_lock);
+		list_del(&abs_struct.list);
+		spin_unlock_irq(&nanosleep_abs_list_lock);
+	}
+	if (active) {
+		unsigned long jiffies_f = jiffies;
+
+		/*
+		 * Always restart abs calls from scratch to pick up any
+		 * clock shifting that happened while we are away.
+		 */
+		if (abs)
+			return -ERESTARTNOHAND;
+
+		jiffies_to_timespec(new_timer.expires - jiffies_f, tsave);
+
+		while (tsave->tv_nsec < 0) {
+			tsave->tv_nsec += NSEC_PER_SEC;
+			tsave->tv_sec--;
+		}
+		if (tsave->tv_sec < 0) {
+			tsave->tv_sec = 0;
+			tsave->tv_nsec = 1;
+		}
+		restart_block->fn = clock_nanosleep_restart;
+		restart_block->arg0 = which_clock;
+		restart_block->arg1 = (int)tsave;
+		restart_block->arg2 = new_timer.expires;
+		return -ERESTART_RESTARTBLOCK;
+	}
+
+	return rtn;
+}
+/*
+ * This will restart either clock_nanosleep or clock_nanosleep
+ */
+long
+clock_nanosleep_restart(struct restart_block *restart_block)
+{
+	struct timespec t;
+	int ret = do_clock_nanosleep(restart_block->arg0, 0, &t);
+
+	if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 && 
+	    copy_to_user((struct timespec *)(restart_block->arg1), &t, 
+			 sizeof (t)))
+		return -EFAULT;
+	return ret;
+}
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -472,8 +472,6 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 		if (!collect_signal(sig, pending, info))
 			sig = 0;
 				
-		/* XXX: Once POSIX.1b timers are in, if si_code == SI_TIMER,
-		   we need to xchg out the timer overrun values.  */
 	}
 	recalc_sigpending();

@@ -492,6 +490,11 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	if (!signr)
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
 					 mask, info);
+	if ( signr &&
+	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
+	     info->si_sys_private){
+		do_schedule_next_timer(info);
+	}
 	return signr;
 }

@@ -677,6 +680,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
 static int send_signal(int sig, struct siginfo *info, struct sigpending *signals)
 {
 	struct sigqueue * q = NULL;
+	int ret = 0;

 	/*
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
@@ -720,17 +724,25 @@ static int send_signal(int sig, struct siginfo *info, struct sigpending *signals
 			copy_siginfo(&q->info, info);
 			break;
 		}
-	} else if (sig >= SIGRTMIN && info && (unsigned long)info != 1
+	} else {
+		if (sig >= SIGRTMIN && info && (unsigned long)info != 1
 		   && info->si_code != SI_USER)
 		/*
 		 * Queue overflow, abort.  We may abort if the signal was rt
 		 * and sent by user using something other than kill().
 		 */
 			return -EAGAIN;
+		if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
+			/*
+			 * Set up a return to indicate that we dropped 
+			 * the signal.
+			 */
+			ret = info->si_sys_private;
+	}

 out_set:
 	sigaddset(&signals->signal, sig);
-	return 0;
+	return ret;
 }

 #define LEGACY_QUEUE(sigptr, sig) \
@@ -749,20 +761,26 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 		BUG();
 #endif

+	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
+		/*
+		 * Set up a return to indicate that we dropped the signal.
+		 */
+		ret = info->si_sys_private;
+
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(t, sig))
-		return 0;
+		goto out;

 	/* Support queueing exactly one non-rt signal, so that we
 	   can get more detailed information about the cause of
 	   the signal. */
 	if (LEGACY_QUEUE(&t->pending, sig))
-		return 0;
+		goto out;

 	ret = send_signal(sig, info, &t->pending);
 	if (!ret && !sigismember(&t->blocked, sig))
 		signal_wake_up(t, sig == SIGKILL);
-
+out:
 	return ret;
 }

@@ -821,7 +839,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	struct task_struct *t;
 	unsigned int mask;
-	int ret;
+	int ret = 0;

 #if CONFIG_SMP
 	if (!spin_is_locked(&p->sighand->siglock))
@@ -829,13 +847,19 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 #endif
 	handle_stop_signal(sig, p);

+	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
+		/*
+		 * Set up a return to indicate that we dropped the signal.
+		 */
+		ret = info->si_sys_private;
+
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(p, sig))
-		return 0;
+		return ret;

 	if (LEGACY_QUEUE(&p->signal->shared_pending, sig))
 		/* This is a non-RT signal and we already have one queued.  */
-		return 0;
+		return ret;

 	/*
 	 * Don't bother zombies and stopped tasks (but
@@ -1322,6 +1346,7 @@ do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent)
 static void
 finish_stop(int stop_count)
 {
+	int ret;
 	/*
 	 * If there are no other threads in the group, or if there is
 	 * a group stop in progress and we are the last to stop,
@@ -1753,8 +1778,9 @@ int copy_siginfo_to_user(siginfo_t *to, siginfo_t *from)
 		err |= __put_user(from->si_uid, &to->si_uid);
 		break;
 	case __SI_TIMER:
-		err |= __put_user(from->si_timer1, &to->si_timer1);
-		err |= __put_user(from->si_timer2, &to->si_timer2);
+		 err |= __put_user(from->si_tid, &to->si_tid);
+		 err |= __put_user(from->si_overrun, &to->si_overrun);
+		 err |= __put_user(from->si_ptr, &to->si_ptr);
 		break;
 	case __SI_POLL:
 		err |= __put_user(from->si_band, &to->si_band);

--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -52,12 +52,11 @@ typedef struct tvec_root_s {
 	struct list_head vec[TVR_SIZE];
 } tvec_root_t;

-typedef struct timer_list timer_t;

 struct tvec_t_base_s {
 	spinlock_t lock;
 	unsigned long timer_jiffies;
-	timer_t *running_timer;
+	struct timer_list *running_timer;
 	tvec_root_t tv1;
 	tvec_t tv2;
 	tvec_t tv3;
@@ -70,7 +69,7 @@ typedef struct tvec_t_base_s tvec_base_t;
 /* Fake initialization */
 static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };

-static void check_timer_failed(timer_t *timer)
+static void check_timer_failed(struct timer_list *timer)
 {
 	static int whine_count;
 	if (whine_count < 16) {
@@ -88,13 +87,13 @@ static void check_timer_failed(timer_t *timer)
 	timer->magic = TIMER_MAGIC;
 }

-static inline void check_timer(timer_t *timer)
+static inline void check_timer(struct timer_list *timer)
 {
 	if (timer->magic != TIMER_MAGIC)
 		check_timer_failed(timer);
 }

-static inline void internal_add_timer(tvec_base_t *base, timer_t *timer)
+static inline void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
 	unsigned long idx = expires - base->timer_jiffies;
@@ -146,7 +145,7 @@ static inline void internal_add_timer(tvec_base_t *base, timer_t *timer)
 * Timers with an ->expired field in the past will be executed in the next
 * timer tick. It's illegal to add an already pending timer.
 */
-void add_timer(timer_t *timer)
+void add_timer(struct timer_list *timer)
 {
 	int cpu = get_cpu();
 	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
@@ -204,7 +203,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
 * active timer returns 1.)
 */
-int mod_timer(timer_t *timer, unsigned long expires)
+int mod_timer(struct timer_list *timer, unsigned long expires)
 {
 	tvec_base_t *old_base, *new_base;
 	unsigned long flags;
@@ -281,7 +280,7 @@ int mod_timer(timer_t *timer, unsigned long expires)
 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
 * active timer returns 1.)
 */
-int del_timer(timer_t *timer)
+int del_timer(struct timer_list *timer)
 {
 	unsigned long flags;
 	tvec_base_t *base;
@@ -320,7 +319,7 @@ int del_timer(timer_t *timer)
 *
 * The function returns whether it has deactivated a pending timer or not.
 */
-int del_timer_sync(timer_t *timer)
+int del_timer_sync(struct timer_list *timer)
 {
 	tvec_base_t *base;
 	int i, ret = 0;
@@ -363,9 +362,9 @@ static int cascade(tvec_base_t *base, tvec_t *tv)
 	 * detach them individually, just clear the list afterwards.
 	 */
 	while (curr != head) {
-		timer_t *tmp;
+		struct timer_list *tmp;

-		tmp = list_entry(curr, timer_t, entry);
+		tmp = list_entry(curr, struct timer_list, entry);
 		if (tmp->base != base)
 			BUG();
 		next = curr->next;
@@ -404,9 +403,9 @@ static inline void __run_timers(tvec_base_t *base)
 		if (curr != head) {
 			void (*fn)(unsigned long);
 			unsigned long data;
-			timer_t *timer;
+			struct timer_list *timer;

-			timer = list_entry(curr, timer_t, entry);
+			timer = list_entry(curr, struct timer_list, entry);
 			fn = timer->function;
 			data = timer->data;

@@ -508,6 +507,7 @@ static void second_overflow(void)
 	if (xtime.tv_sec % 86400 == 0) {
 	    xtime.tv_sec--;
 	    time_state = TIME_OOP;
+	    clock_was_set();
 	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
 	}
 	break;
@@ -516,6 +516,7 @@ static void second_overflow(void)
 	if ((xtime.tv_sec + 1) % 86400 == 0) {
 	    xtime.tv_sec++;
 	    time_state = TIME_WAIT;
+	    clock_was_set();
 	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
 	}
 	break;
@@ -968,7 +969,7 @@ static void process_timeout(unsigned long __data)
 */
 signed long schedule_timeout(signed long timeout)
 {
-	timer_t timer;
+	struct timer_list timer;
 	unsigned long expire;

 	switch (timeout)
@@ -1023,6 +1024,7 @@ asmlinkage long sys_gettid(void)
 {
 	return current->pid;
 }
+#ifndef FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP

 static long nanosleep_restart(struct restart_block *restart)
 {
@@ -1081,6 +1083,7 @@ asmlinkage long sys_nanosleep(struct timespec *rqtp, struct timespec *rmtp)
 	}
 	return ret;
 }
+#endif // ! FOLD_NANO_SLEEP_INTO_CLOCK_NANO_SLEEP

 /*
 * sys_sysinfo - fill in sysinfo struct

--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,7 +10,7 @@ L_TARGET := lib.a

 obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \
 	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
-	 kobject.o
+	 kobject.o idr.o

 obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o

--- a/lib/idr.c
+++ b/lib/idr.c
+/*
+ * linux/kernel/id.c
+ *
+ * 2002-10-18  written by Jim Houston jim.houston@ccur.com
+ *	Copyright (C) 2002 by Concurrent Computer Corporation
+ *	Distributed under the GNU GPL license version 2.
+ *
+ * Small id to pointer translation service.  
+ *
+ * It uses a radix tree like structure as a sparse array indexed 
+ * by the id to obtain the pointer.  The bitmap makes allocating
+ * a new id quick.  
+
+ * Modified by George Anzinger to reuse immediately and to use
+ * find bit instructions.  Also removed _irq on spinlocks.
+
+ * So here is what this bit of code does:
+
+ * You call it to allocate an id (an int) an associate with that id a
+ * pointer or what ever, we treat it as a (void *).  You can pass this
+ * id to a user for him to pass back at a later time.  You then pass
+ * that id to this code and it returns your pointer.
+
+ * You can release ids at any time. When all ids are released, most of 
+ * the memory is returned (we keep IDR_FREE_MAX) in a local pool so we
+ * don't need to go to the memory "store" during an id allocate, just 
+ * so you don't need to be too concerned about locking and conflicts
+ * with the slab allocator.
+
+ * A word on reuse.  We reuse empty id slots as soon as we can, always
+ * using the lowest one available.  But we also merge a counter in the
+ * high bits of the id.  The counter is RESERVED_ID_BITS (8 at this time)
+ * long.  This means that if you allocate and release the same id in a 
+ * loop we will reuse an id after about 256 times around the loop.  The
+ * word about is used here as we will NOT return a valid id of -1 so if
+ * you loop on the largest possible id (and that is 24 bits, wow!) we
+ * will kick the counter to avoid -1.  (Paranoid?  You bet!)
+ *
+ * What you need to do is, since we don't keep the counter as part of
+ * id / ptr pair, to keep a copy of it in the pointed to structure
+ * (or else where) so that when you ask for a ptr you can varify that
+ * the returned ptr is correct by comparing the id it contains with the one
+ * you asked for.  In other words, we only did half the reuse protection.
+ * Since the code depends on your code doing this check, we ignore high
+ * order bits in the id, not just the count, but bits that would, if used,
+ * index outside of the allocated ids.  In other words, if the largest id
+ * currently allocated is 32 a look up will only look at the low 5 bits of
+ * the id.  Since you will want to keep this id in the structure anyway
+ * (if for no other reason than to be able to eliminate the id when the
+ * structure is found in some other way) this seems reasonable.  If you
+ * really think otherwise, the code to check these bits here, it is just
+ * disabled with a #if 0.
+
+
+ * So here are the complete details:
+
+ *  include <linux/idr.h>
+
+ * void idr_init(struct idr *idp)
+
+ *   This function is use to set up the handle (idp) that you will pass
+ *   to the rest of the functions.  The structure is defined in the
+ *   header.
+
+ * int idr_pre_get(struct idr *idp)
+
+ *   This function should be called prior to locking and calling the
+ *   following function.  It pre allocates enough memory to satisfy the
+ *   worst possible allocation.  It can sleep, so must not be called
+ *   with any spinlocks held.  If the system is REALLY out of memory
+ *   this function returns 0, other wise 1.
+
+ * int idr_get_new(struct idr *idp, void *ptr);
+ 
+ *   This is the allocate id function.  It should be called with any
+ *   required locks.  In fact, in the SMP case, you MUST lock prior to
+ *   calling this function to avoid possible out of memory problems.  If
+ *   memory is required, it will return a -1, in which case you should
+ *   unlock and go back to the idr_pre_get() call.  ptr is the pointer
+ *   you want associated with the id.  In other words:
+
+ * void *idr_find(struct idr *idp, int id);
+ 
+ *   returns the "ptr", given the id.  A NULL return indicates that the
+ *   id is not valid (or you passed NULL in the idr_get_new(), shame on
+ *   you).  This function must be called with a spinlock that prevents
+ *   calling either idr_get_new() or idr_remove() or idr_find() while it
+ *   is working.
+
+ * void idr_remove(struct idr *idp, int id);
+
+ *   removes the given id, freeing that slot and any memory that may
+ *   now be unused.  See idr_find() for locking restrictions.
+
+ */
+
+
+
+#ifndef TEST                        // to test in user space...
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#endif
+#include <linux/string.h>
+#include <linux/idr.h>
+
+
+static kmem_cache_t *idr_layer_cache;
+
+
+
+static inline struct idr_layer *alloc_layer(struct idr *idp)
+{
+	struct idr_layer *p;
+
+	spin_lock(&idp->lock);
+	if (!(p = idp->id_free))
+		BUG();
+	idp->id_free = p->ary[0];
+	idp->id_free_cnt--;
+	p->ary[0] = 0;
+	spin_unlock(&idp->lock);
+	return(p);
+}
+
+static inline void free_layer(struct idr *idp, struct idr_layer *p)
+{
+	/*
+	 * Depends on the return element being zeroed.
+	 */
+	spin_lock(&idp->lock);
+	p->ary[0] = idp->id_free;
+	idp->id_free = p;
+	idp->id_free_cnt++;
+	spin_unlock(&idp->lock);
+}
+
+int idr_pre_get(struct idr *idp)
+{
+	while (idp->id_free_cnt < idp->layers + 1) {
+		struct idr_layer *new;
+		new = kmem_cache_alloc(idr_layer_cache, GFP_KERNEL);
+		if(new == NULL)
+			return (0);
+		free_layer(idp, new);
+	}
+	return 1;
+}
+EXPORT_SYMBOL(idr_pre_get);
+
+static inline int sub_alloc(struct idr *idp, int shift, void *ptr)
+{
+	int n, v = 0;
+	struct idr_layer *p;
+	struct idr_layer **pa[MAX_LEVEL];
+	struct idr_layer ***paa = &pa[0];
+	
+	*paa = NULL;
+	*++paa = &idp->top;
+
+	/*
+	 * By keeping each pointer in an array we can do the 
+	 * "after" recursion processing.  In this case, that means
+	 * we can update the upper level bit map.
+	 */
+	
+	while (1){
+		p = **paa;
+		n = ffz(p->bitmap);
+		if (shift){
+			/*
+			 * We run around this while until we
+			 * reach the leaf node...
+			 */
+			if (!p->ary[n]){
+				/*
+				 * If no node, allocate one, AFTER
+				 * we insure that we will not
+				 * intrude on the reserved bit field.
+				 */
+				if ((n << shift) >= MAX_ID_BIT)
+					return -1;
+				p->ary[n] = alloc_layer(idp);
+				p->count++;
+			}
+			*++paa = &p->ary[n];
+			v += (n << shift);
+			shift -= IDR_BITS;
+		} else {
+			/*
+			 * We have reached the leaf node, plant the
+			 * users pointer and return the raw id.
+			 */
+			p->ary[n] = (struct idr_layer *)ptr;
+			__set_bit(n, &p->bitmap);
+			v += n;
+			p->count++;
+			/*
+			 * This is the post recursion processing.  Once
+			 * we find a bitmap that is not full we are
+			 * done
+			 */
+			while (*(paa-1) && (**paa)->bitmap == IDR_FULL){
+				n = *paa - &(**(paa-1))->ary[0];
+				__set_bit(n, &(**--paa)->bitmap);
+			}
+			return(v);
+		}
+	}
+}
+
+int idr_get_new(struct idr *idp, void *ptr)
+{
+	int v;
+	
+	if (idp->id_free_cnt < idp->layers + 1) 
+		return (-1);
+	/*
+	 * Add a new layer if the array is full 
+	 */
+	if (unlikely(!idp->top || idp->top->bitmap == IDR_FULL)){
+		/*
+		 * This is a bit different than the lower layers because
+		 * we have one branch already allocated and full.
+		 */
+		struct idr_layer *new = alloc_layer(idp);
+		new->ary[0] = idp->top;
+		if ( idp->top)
+			++new->count;
+		idp->top = new;
+		if ( idp->layers++ )
+			__set_bit(0, &new->bitmap);
+	}
+	v = sub_alloc(idp,  (idp->layers - 1) * IDR_BITS, ptr);
+	if ( likely(v >= 0 )){
+		idp->count++;
+		v += (idp->count << MAX_ID_SHIFT);
+		if ( unlikely( v == -1 ))
+		     v += (1 << MAX_ID_SHIFT);
+	}
+	return(v);
+}
+EXPORT_SYMBOL(idr_get_new);
+
+
+static inline void sub_remove(struct idr *idp, int shift, int id)
+{
+	struct idr_layer *p = idp->top;
+	struct idr_layer **pa[MAX_LEVEL];
+	struct idr_layer ***paa = &pa[0];
+
+	*paa = NULL;
+	*++paa = &idp->top;
+
+	while ((shift > 0) && p) {
+		int n = (id >> shift) & IDR_MASK;
+		__clear_bit(n, &p->bitmap);
+		*++paa = &p->ary[n];
+		p = p->ary[n];
+		shift -= IDR_BITS;
+	}
+	if ( likely(p)){
+		int n = id & IDR_MASK;
+		__clear_bit(n, &p->bitmap);
+		p->ary[n] = NULL;
+		while(*paa && ! --((**paa)->count)){
+			free_layer(idp, **paa);
+			**paa-- = NULL;
+		}
+		if ( ! *paa )
+			idp->layers = 0;
+	}
+}
+void idr_remove(struct idr *idp, int id)
+{
+	struct idr_layer *p;
+
+	sub_remove(idp, (idp->layers - 1) * IDR_BITS, id);
+	if ( idp->top && idp->top->count == 1 && 
+	     (idp->layers > 1) &&
+	     idp->top->ary[0]){  // We can drop a layer
+
+		p = idp->top->ary[0];
+		idp->top->bitmap = idp->top->count = 0;
+		free_layer(idp, idp->top);
+		idp->top = p;
+		--idp->layers;
+	}
+	while (idp->id_free_cnt >= IDR_FREE_MAX) {
+		
+		p = alloc_layer(idp);
+		kmem_cache_free(idr_layer_cache, p);
+		return;
+	}
+}
+EXPORT_SYMBOL(idr_remove);
+
+void *idr_find(struct idr *idp, int id)
+{
+	int n;
+	struct idr_layer *p;
+
+	n = idp->layers * IDR_BITS;
+	p = idp->top;
+#if 0
+	/*
+	 * This tests to see if bits outside the current tree are
+	 * present.  If so, tain't one of ours!
+	 */
+	if ( unlikely( (id & ~(~0 << MAX_ID_SHIFT)) >> (n + IDR_BITS)))
+	     return NULL;
+#endif
+	while (n > 0 && p) {
+		n -= IDR_BITS;
+		p = p->ary[(id >> n) & IDR_MASK];
+	}
+	return((void *)p);
+}
+EXPORT_SYMBOL(idr_find);
+
+static void idr_cache_ctor(void * idr_layer, 
+			   kmem_cache_t *idr_layer_cache, unsigned long flags)
+{
+	memset(idr_layer, 0, sizeof(struct idr_layer));
+}
+
+static  int init_id_cache(void)
+{
+	if (!idr_layer_cache)
+		idr_layer_cache = kmem_cache_create("idr_layer_cache", 
+			sizeof(struct idr_layer), 0, 0, idr_cache_ctor, 0);
+	return 0;
+}
+
+void idr_init(struct idr *idp)
+{
+	init_id_cache();
+	memset(idp, 0, sizeof(struct idr));
+	spin_lock_init(&idp->lock);
+}
+EXPORT_SYMBOL(idr_init);
+