diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile
index ade4fbf105c9cdf9362109ddb945c37039de2156..3cad76a0c4705a3ad4859f98731d32a8b95cc539 100644
--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -96,6 +96,8 @@ MAKEBOOT = $(MAKE) -C arch/$(ARCH)/boot
 
 vmlinux: arch/$(ARCH)/vmlinux.lds.s
 
+CPPFLAGS_arch/$(ARCH)/vmlinux.lds.s
+
 compressed: vmlinux
 	$(OBJCOPY) $(OBJCOPYFLAGS) vmlinux vmlinux-tmp
 	gzip vmlinux-tmp
@@ -104,30 +106,14 @@ compressed: vmlinux
 rawboot:
 	@$(MAKEBOOT) rawboot
 
-#
-# My boot writes directly to a specific disk partition, I doubt most
-# people will want to do that without changes..
-#
-msb my-special-boot:
-	@$(MAKEBOOT) msb
-
-bootimage:
-	@$(MAKEBOOT) bootimage
-
-srmboot:
-	@$(MAKEBOOT) srmboot
-
 archclean:
 	@$(MAKEBOOT) clean
 
 archmrproper:
 	@$(MAKE) -C arch/$(ARCH)/tools mrproper
 
-bootpfile:
-	@$(MAKEBOOT) bootpfile
-
 prepare: $(TOPDIR)/include/asm-ia64/offsets.h
 
 $(TOPDIR)/include/asm-ia64/offsets.h: include/asm include/linux/version.h \
 				      include/config/MARKER
-	@$(MAKE) -C arch/$(ARCH)/tools $@
\ No newline at end of file
+	@$(MAKE) -C arch/$(ARCH)/tools $@
diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 9a49666f2b6efacbdf67685c55896f9ae07d2a32..a4daee076ed46f1dd0188be8f412b8e111e8e02f 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -2111,8 +2111,8 @@ struct shm_info32 {
 };
 
 struct ipc_kludge {
-	struct msgbuf *msgp;
-	long msgtyp;
+	u32 msgp;
+	s32 msgtyp;
 };
 
 #define SEMOP		 1
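
The ipc_kludge fix above is the standard compat-layer rule: in the ia32 ABI a
pointer and a long are 4 bytes each, so the kernel-side mirror of a 32-bit
structure must use fixed-width types and widen pointers explicitly.  A rough
usage sketch (the surrounding code is assumed, not part of this patch):

	struct ipc_kludge k;

	if (copy_from_user(&k, (void *) arg, sizeof(k)))
		return -EFAULT;
	/* widen the 32-bit user pointer by hand -- no implicit conversion */
	msgp = (struct msgbuf *)(unsigned long) k.msgp;
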
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index 33c2d60b13b2e192d85d743d2eb231b689e611fc..1e34ad720b3e67e49efa5e6263c5f98630bf48e1 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -2,7 +2,7 @@
  * This file contains the code that gets mapped at the upper end of each task's text
  * region.  For now, it contains the signal trampoline code only.
  *
- * Copyright (C) 1999-2001 Hewlett-Packard Co
+ * Copyright (C) 1999-2002 Hewlett-Packard Co
  * 	David Mosberger-Tang <davidm@hpl.hp.com>
  */
 
@@ -135,7 +135,7 @@ back_from_setup_rbs:
 	;;
 	ld8 r8=[base0]				// restore (perhaps modified) CFM0, EC0, and CPL0
 	cmp.ne p8,p0=r14,r15			// do we need to restore the rbs?
-(p8)	br.cond.spnt restore_rbs		// yup -> (clobbers r14 and r16)
+(p8)	br.cond.spnt restore_rbs		// yup -> (clobbers r14-r18, f6 & f7)
 	;;
 back_from_restore_rbs:
 	adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
@@ -189,20 +189,69 @@ setup_rbs:
 	.spillsp ar.rnat, RNAT_OFF+SIGCONTEXT_OFF
 	.body
 restore_rbs:
+	// On input:
+	//	r14 = bsp1 (bsp at the time of return from signal handler)
+	//	r15 = bsp0 (bsp at the time the signal occurred)
+	//
+	// Here, we need to calculate bspstore0, the value that ar.bspstore needs
+	// to be set to, based on bsp0 and the size of the dirty partition on
+	// the alternate stack (sc_loadrs >> 16).  This can be done with the
+	// following algorithm:
+	//
+	//  bspstore0 = rse_skip_regs(bsp0, -rse_num_regs(bsp1 - (loadrs >> 19), bsp1));
+	//
+	// This is what the code below does.
+	//
 	alloc r2=ar.pfs,0,0,0,0			// alloc null frame
 	adds r16=(LOADRS_OFF+SIGCONTEXT_OFF),sp
+	adds r18=(RNAT_OFF+SIGCONTEXT_OFF),sp
 	;;
-	ld8 r14=[r16]
-	adds r16=(RNAT_OFF+SIGCONTEXT_OFF),sp
+	ld8 r17=[r16]
+	ld8 r16=[r18]			// get new rnat
+	extr.u r18=r15,3,6	// r18 <- rse_slot_num(bsp0)
 	;;
-	mov ar.rsc=r14				// put RSE into enforced lazy mode
-	ld8 r14=[r16]				// get new rnat
+	mov ar.rsc=r17			// put RSE into enforced lazy mode
+	shr.u r17=r17,16
 	;;
-	loadrs					// restore dirty partition
+	sub r14=r14,r17		// r14 (bspstore1) <- bsp1 - (sc_loadrs >> 16)
+	shr.u r17=r17,3		// r17 <- (sc_loadrs >> 19)
+	;;
+	loadrs			// restore dirty partition
+	extr.u r14=r14,3,6	// r14 <- rse_slot_num(bspstore1)
+	;;
+	add r14=r14,r17		// r14 <- rse_slot_num(bspstore1) + (sc_loadrs >> 19)
+	;;
+	shr.u r14=r14,6		// r14 <- (rse_slot_num(bspstore1) + (sc_loadrs >> 19))/0x40
+	;;
+	sub r14=r14,r17		// r14 <- -rse_num_regs(bspstore1, bsp1)
+	movl r17=0x8208208208208209
+	;;
+	add r18=r18,r14		// r18 (delta) <- rse_slot_num(bsp0) - rse_num_regs(bspstore1,bsp1)
+	setf.sig f7=r17
+	cmp.lt p7,p0=r14,r0	// p7 <- (r14 < 0)?
+	;;
+(p7)	adds r18=-62,r18	// delta -= 62
+	;;
+	setf.sig f6=r18
+	;;
+	xmpy.h f6=f6,f7
+	;;
+	getf.sig r17=f6
+	;;
+	add r17=r17,r18
+	shr r18=r18,63
+	;;
+	shr r17=r17,5
+	;;
+	sub r17=r17,r18		// r17 = delta/63
+	;;
+	add r17=r14,r17		// r17 <- delta/63 - rse_num_regs(bspstore1, bsp1)
+	;;
+	shladd r15=r17,3,r15	// r15 <- bsp0 + 8*(delta/63 - rse_num_regs(bspstore1, bsp1))
 	;;
 	mov ar.bspstore=r15			// switch back to old register backing store area
 	;;
-	mov ar.rnat=r14				// restore RNaT
+	mov ar.rnat=r16				// restore RNaT
 	mov ar.rsc=0xf				// (will be restored later on from sc_ar_rsc)
 	// invala not necessary as that will happen when returning to user-mode
 	br.cond.sptk back_from_restore_rbs
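
The pseudo-code in the comment above maps onto the RSE helpers from
include/asm-ia64/rse.h (also touched by this patch).  Roughly, in C (a
sketch, with bsp0/bsp1/sc_loadrs as named in the comment; the assembly
open-codes the helpers because the Itanium has no integer-divide
instruction, so the delta/63 from ia64_rse_skip_regs() is computed via
xmpy.h with the magic constant 0x8208208208208209):

	unsigned long *bspstore1, *bspstore0;
	unsigned long ndirty;

	bspstore1 = bsp1 - (sc_loadrs >> 19);	/* dirty partition is (sc_loadrs >> 16) bytes */
	ndirty    = ia64_rse_num_regs(bspstore1, bsp1);
	bspstore0 = ia64_rse_skip_regs(bsp0, -(long) ndirty);
	/* then ar.bspstore <- bspstore0, ar.rnat <- sc_ar_rnat */
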
diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c
index babb029257705b6677264de792f5ace70b2e1c4d..1c664c25e982eb1c0d834f848f3941714d819617 100644
--- a/arch/ia64/kernel/ia64_ksyms.c
+++ b/arch/ia64/kernel/ia64_ksyms.c
@@ -127,6 +127,8 @@ EXPORT_SYMBOL(ia64_pal_call_phys_stacked);
 EXPORT_SYMBOL(ia64_pal_call_phys_static);
 EXPORT_SYMBOL(ia64_pal_call_stacked);
 EXPORT_SYMBOL(ia64_pal_call_static);
+EXPORT_SYMBOL(ia64_load_scratch_fpregs);
+EXPORT_SYMBOL(ia64_save_scratch_fpregs);
 
 extern struct efi efi;
 EXPORT_SYMBOL(efi);
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 36713af5ed0ef97cc80d2d70ac30ecaec94269ac..cde008c9c76b0063a1e17ca0c62a6dff0d5f7934 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -403,8 +403,8 @@ unsigned int do_IRQ(unsigned long irq, struct pt_regs *regs)
 				break;
 			desc->status &= ~IRQ_PENDING;
 		}
-	  out:
 		desc->status &= ~IRQ_INPROGRESS;
+	  out:
 		/*
 		 * The ->end() handler has to deal with interrupts which got
 		 * disabled while the handler was running.
@@ -788,7 +788,7 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 
 	if (!shared) {
 		desc->depth = 0;
-		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | IRQ_WAITING);
+		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | IRQ_WAITING | IRQ_INPROGRESS);
 		desc->handler->startup(irq);
 	}
 	spin_unlock_irqrestore(&desc->lock,flags);
diff --git a/arch/ia64/kernel/pal.S b/arch/ia64/kernel/pal.S
index a60bd219f431653178fc100b57bc6fd17abdbefd..ae0a0305d86e906ab4e8b142304948f03f327283 100644
--- a/arch/ia64/kernel/pal.S
+++ b/arch/ia64/kernel/pal.S
@@ -245,3 +245,48 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	br.ret.sptk.many b0
 END(ia64_pal_call_phys_stacked)
 
+/*
+ * Save the scratch fp regs which aren't saved in pt_regs already (f10-f15).
+ *
+ * NOTE: We need to do this since firmware (SAL and PAL) may use any of the
+ * scratch regs in the fp-low partition.
+ *
+ * Inputs:
+ *      in0	Address of stack storage for fp regs
+ */
+GLOBAL_ENTRY(ia64_save_scratch_fpregs)
+	alloc r3=ar.pfs,1,0,0,0
+	add r2=16,in0
+	;;
+	stf.spill [in0] = f10,32
+	stf.spill [r2]  = f11,32
+	;;
+	stf.spill [in0] = f12,32
+	stf.spill [r2]  = f13,32
+	;;
+	stf.spill [in0] = f14,32
+	stf.spill [r2]  = f15,32
+	br.ret.sptk.many rp
+END(ia64_save_scratch_fpregs)
+
+/*
+ * Load the scratch fp regs (f10-f15).
+ *
+ * Inputs:
+ *      in0	Address of stack storage for fp regs
+ */
+
+GLOBAL_ENTRY(ia64_load_scratch_fpregs)
+	alloc r3=ar.pfs,1,0,0,0
+	add r2=16,in0
+	;;
+	ldf.fill  f10 = [in0],32
+	ldf.fill  f11 = [r2],32
+	;;
+	ldf.fill  f12 = [in0],32
+	ldf.fill  f13 = [r2],32
+	;;
+	ldf.fill  f14 = [in0],32
+	ldf.fill  f15 = [r2],32
+	br.ret.sptk.many rp
+END(ia64_load_scratch_fpregs)
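
The remaining fp scratch registers (f6-f9) are already spilled into pt_regs
on kernel entry, which is why only f10-f15 need saving here.  The intended
usage bracket (exactly what the PAL_CALL/SAL_CALL macros later in this patch
do) looks like:

	struct ia64_fpreg fr[6];

	ia64_save_scratch_fpregs(fr);	/* spill f10-f15 to stack storage */
	/* ... call into SAL or PAL firmware ... */
	ia64_load_scratch_fpregs(fr);	/* restore them afterwards */
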
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 361d93363f5a43398538cf61f6c6841cced1c8ac..bd0f017f134ed15085b28e6a62fd9189834961c3 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -193,10 +193,12 @@ typedef enum {
  */
 typedef struct {
 	u64 val;	/* virtual 64bit counter value */
-	u64 ival;	/* initial value from user */
+	u64 lval;	/* last value */
 	u64 long_reset;	/* reset value on sampling overflow */
 	u64 short_reset;/* reset value on overflow */
 	u64 reset_pmds[4]; /* which other pmds to reset when this counter overflows */
+	u64 seed;	/* seed for random-number generator */
+	u64 mask;	/* mask for random-number generator */
 	int flags;	/* notify/do not notify */
 } pfm_counter_t;
 
@@ -336,7 +338,7 @@ typedef struct {
 
 #define PFM_CMD_PID		0x1	/* command requires pid argument */
 #define PFM_CMD_ARG_READ	0x2	/* command must read argument(s) */
-#define PFM_CMD_ARG_WRITE	0x4	/* command must write argument(s) */
+#define PFM_CMD_ARG_RW		0x4	/* command must read/write argument(s) */
 #define PFM_CMD_CTX		0x8	/* command needs a perfmon context */
 #define PFM_CMD_NOCHK		0x10	/* command does not need to check task's state */
 
@@ -347,7 +349,7 @@ typedef struct {
 
 #define PFM_CMD_USE_PID(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_PID) != 0)
 #define PFM_CMD_READ_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_READ) != 0)
-#define PFM_CMD_WRITE_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_WRITE) != 0)
+#define PFM_CMD_RW_ARG(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_ARG_RW) != 0)
 #define PFM_CMD_USE_CTX(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_CTX) != 0)
 #define PFM_CMD_CHK(cmd)	((pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_flags & PFM_CMD_NOCHK) == 0)
 
@@ -376,7 +378,9 @@ static pfm_session_t	pfm_sessions;	/* global sessions information */
 static struct proc_dir_entry *perfmon_dir; /* for debug only */
 static pfm_stats_t	pfm_stats;
 DEFINE_PER_CPU(int, pfm_syst_wide);
+#ifdef CONFIG_SMP
 static DEFINE_PER_CPU(int, pfm_dcr_pp);
+#endif
 
 /* sysctl() controls */
 static pfm_sysctl_t pfm_sysctl;
@@ -743,15 +747,14 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo
 	psb = kmalloc(sizeof(*psb), GFP_KERNEL);
 	if (psb == NULL) {
 		DBprintk(("Can't allocate sampling buffer descriptor\n"));
-		pfm_rvfree(smpl_buf, size);
-		return -ENOMEM;
+		goto error_kmalloc;
 	}
 
 	/* allocate vma */
 	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 	if (!vma) {
 		DBprintk(("Cannot allocate vma\n"));
-		goto error;
+		goto error_kmem;
 	}
 	/*
 	 * partially initialize the vma for the sampling buffer
@@ -851,8 +854,11 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo
 	return 0;
 
 error:
-	pfm_rvfree(smpl_buf, size);
+	kmem_cache_free(vm_area_cachep, vma);
+error_kmem:
 	kfree(psb);
+error_kmalloc:
+	pfm_rvfree(smpl_buf, size);
 	return -ENOMEM;
 }
 
@@ -961,7 +967,7 @@ pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int
 	 */
 	if (task != current) return -EINVAL;
 
-	if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
+	if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
 
 	ret = pfx_is_sane(task, &tmp);
 	if (ret < 0) return ret;
@@ -1034,7 +1040,10 @@ pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int
 			/*
 			 * check if we can send this task a signal
 			 */
-			if (pfm_bad_permissions(notify_task)) goto buffer_error;
+			if (pfm_bad_permissions(notify_task)) {
+				read_unlock(&tasklist_lock);
+				goto buffer_error;
+			}
 
 			/* 
 		 	 * make visible
@@ -1101,7 +1110,7 @@ pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int
 
 	sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */
 
-	if (copy_to_user(req, &tmp, sizeof(tmp))) {
+	if (__copy_to_user(req, &tmp, sizeof(tmp))) {
 		ret = -EFAULT;
 		goto buffer_error;
 	}
@@ -1150,13 +1159,32 @@ pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int
 	return ret;
 }
 
+static inline unsigned long
+pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset)
+{
+	unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset;
+	unsigned long new_seed, old_seed = reg->seed, mask = reg->mask;
+	extern unsigned long carta_random32 (unsigned long seed);
+
+	if (reg->flags & PFM_REGFL_RANDOM) {
+		new_seed = carta_random32(old_seed);
+		val -= (old_seed & mask);	/* counter values are negative numbers! */
+		if ((mask >> 32) != 0)
+			/* construct a full 64-bit random value: */
+			new_seed |= carta_random32(old_seed >> 32) << 32;
+		reg->seed = new_seed;
+	}
+	reg->lval = val;
+	return val;
+}
+
 static void
 pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
 {
 	unsigned long mask = ovfl_regs[0];
 	unsigned long reset_others = 0UL;
 	unsigned long val;
-	int i;
+	int i, is_long_reset = (flag & PFM_RELOAD_LONG_RESET);
 
 	DBprintk(("masks=0x%lx\n", mask));
 
@@ -1166,15 +1194,11 @@ pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
 	mask >>= PMU_FIRST_COUNTER;
 	for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) {
 		if (mask & 0x1) {
-			val  = flag == PFM_RELOAD_LONG_RESET ? 
-					ctx->ctx_soft_pmds[i].long_reset:
-					ctx->ctx_soft_pmds[i].short_reset;
-
+			val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
 			reset_others |= ctx->ctx_soft_pmds[i].reset_pmds[0];
 
-			DBprintk(("[%d] %s reset soft_pmd[%d]=%lx\n", 
-			  	current->pid, 
-				flag == PFM_RELOAD_LONG_RESET ? "long" : "short", i, val));
+			DBprintk(("[%d] %s reset soft_pmd[%d]=%lx\n", current->pid,
+				  is_long_reset ? "long" : "short", i, val));
 
 			/* upper part is ignored on rval */
 			pfm_write_soft_counter(ctx, i, val);
@@ -1188,19 +1212,15 @@ pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
 
 		if ((reset_others & 0x1) == 0) continue;
 
-		val  = flag == PFM_RELOAD_LONG_RESET ? 
-					ctx->ctx_soft_pmds[i].long_reset:
-					ctx->ctx_soft_pmds[i].short_reset;
+		val = pfm_new_counter_value(ctx->ctx_soft_pmds + i, is_long_reset);
 
 		if (PMD_IS_COUNTING(i)) {
 			pfm_write_soft_counter(ctx, i, val);
 		} else {
 			ia64_set_pmd(i, val);
 		}
-
-		DBprintk(("[%d] %s reset_others pmd[%d]=%lx\n", 
-			  	current->pid, 
-				flag == PFM_RELOAD_LONG_RESET ? "long" : "short", i, val));
+		DBprintk(("[%d] %s reset_others pmd[%d]=%lx\n", current->pid,
+			  is_long_reset ? "long" : "short", i, val));
 	}
 	ia64_srlz_d();
 	/* just in case ! */
@@ -1225,8 +1245,7 @@ pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 
 	for (i = 0; i < count; i++, req++) {
 
-
-		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
+		if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
 
 		cnum = tmp.reg_num;
 
@@ -1283,6 +1302,9 @@ pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 			ctx->ctx_soft_pmds[cnum].reset_pmds[1] = tmp.reg_reset_pmds[1];
 			ctx->ctx_soft_pmds[cnum].reset_pmds[2] = tmp.reg_reset_pmds[2];
 			ctx->ctx_soft_pmds[cnum].reset_pmds[3] = tmp.reg_reset_pmds[3];
+
+			if (tmp.reg_flags & PFM_REGFL_RANDOM)
+				ctx->ctx_soft_pmds[cnum].flags |= PFM_REGFL_RANDOM;
 		}
 		/*
 		 * execute write checker, if any
@@ -1295,8 +1317,10 @@ pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 
 		/*
 		 * update register return value, abort all if problem during copy.
+		 * we only modify the reg_flags field. the unchecked __put_user() is
+		 * fine because access has been verified upfront in sys_perfmonctl().
 		 */
-		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
+		if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;
 
 		/*
 		 * if there was something wrong on this register, don't touch
@@ -1306,7 +1330,7 @@ pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 		 * entry which has a return flag set is the one that caused the error.
 		 */
 		if (ret != 0) {
-			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
+			DBprintk(("[%d] pmc[%u]=0x%lx err %d\n",
 				  task->pid, cnum, tmp.reg_value, reg_retval));
 			break;
 		}
@@ -1359,21 +1383,24 @@ pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 
 	for (i = 0; i < count; i++, req++) {
 
-		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
+		if (__copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
 
 		cnum = tmp.reg_num;
+
 		if (!PMD_IS_IMPL(cnum)) {
+			DBprintk(("pmd[%u] is unimplemented or invalid\n", cnum));
 			ret = -EINVAL;
 			goto abort_mission;
 		}
 
 		/* update virtualized (64bits) counter */
 		if (PMD_IS_COUNTING(cnum)) {
-			ctx->ctx_soft_pmds[cnum].ival = tmp.reg_value;
+			ctx->ctx_soft_pmds[cnum].lval = tmp.reg_value;
 			ctx->ctx_soft_pmds[cnum].val  = tmp.reg_value & ~pmu_conf.perf_ovfl_val;
 			ctx->ctx_soft_pmds[cnum].long_reset = tmp.reg_long_reset;
 			ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;
-
+			ctx->ctx_soft_pmds[cnum].seed = tmp.reg_random_seed;
+			ctx->ctx_soft_pmds[cnum].mask = tmp.reg_random_mask;
 		}
 		/*
 		 * execute write checker, if any
@@ -1384,7 +1411,7 @@ pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 
 		PFM_REG_RETFLAG_SET(tmp.reg_flags, reg_retval);
 
-		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
+		if (__put_user(tmp.reg_flags, &req->reg_flags)) return -EFAULT;
 
 		/*
 		 * if there was something wrong on this register, don't touch
@@ -1394,8 +1421,8 @@ pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int coun
 		 * entry which has a return flag set is the one that caused the error.
 		 */
 		if (ret != 0) {
-			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
-				  task->pid, cnum, tmp.reg_value, reg_retval));
+			DBprintk(("[%d] pmc[%u]=0x%lx ret %d error %d\n",
+				  task->pid, cnum, tmp.reg_value, ret, reg_retval));
 			break;
 		}
 
@@ -1428,9 +1455,9 @@ static int
 pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
 	struct thread_struct *th = &task->thread;
-	unsigned long val=0;
-	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
-	unsigned int cnum;
+	unsigned long val = 0UL;
+	pfarg_reg_t *req = (pfarg_reg_t *)arg;
+	unsigned int cnum, reg_flags = 0;
 	int i, ret = 0;
 
 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;
@@ -1447,11 +1474,9 @@ pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count
 	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), task->pid));
 
 	for (i = 0; i < count; i++, req++) {
-		unsigned long ctx_val = ~0UL;
 
-		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
-
-		cnum = tmp.reg_num;
+		if (__get_user(cnum, &req->reg_num)) return -EFAULT;
+		if (__get_user(reg_flags, &req->reg_flags)) return -EFAULT;
 
 		if (!PMD_IS_IMPL(cnum)) goto abort_mission;
 		/*
@@ -1501,34 +1526,42 @@ pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count
 			 */
 
 			val &= pmu_conf.perf_ovfl_val;
-			val += ctx_val = ctx->ctx_soft_pmds[cnum].val;
+			val += ctx->ctx_soft_pmds[cnum].val;
 		} 
 
-		tmp.reg_value = val;
-
 		/*
 		 * execute read checker, if any
 		 */
 		if (PMD_RD_FUNC(cnum)) {
-			ret = PMD_RD_FUNC(cnum)(task, cnum, &tmp.reg_value, regs);
+			unsigned long v = val;
+			ret = PMD_RD_FUNC(cnum)(task, cnum, &v, regs);
+			val = v;
 		}
 
-		PFM_REG_RETFLAG_SET(tmp.reg_flags, ret);
+		PFM_REG_RETFLAG_SET(reg_flags, ret);
 
 		DBprintk(("read pmd[%u] ret=%d value=0x%lx pmc=0x%lx\n", 
 					cnum, ret, val, ia64_get_pmc(cnum)));
-
-		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
+		/*
+		 * update the register return values; abort all if problem during copy.
+		 * the unchecked __put_user() is fine because access has been verified
+		 * upfront in sys_perfmonctl().
+		 */
+		if (__put_user(cnum, &req->reg_num)) return -EFAULT;
+		if (__put_user(val, &req->reg_value)) return -EFAULT;
+		if (__put_user(reg_flags, &req->reg_flags)) return -EFAULT;
 	}
+
 	return 0;
+
 abort_mission:
-	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
+	PFM_REG_RETFLAG_SET(reg_flags, PFM_REG_RETFL_EINVAL);
 	/* 
 	 * XXX: if this fails, we stick with the original failure, flag not updated!
 	 */
-	copy_to_user(req, &tmp, sizeof(tmp));
-	return -EINVAL;
+	__put_user(reg_flags, &req->reg_flags);
 
+	return -EINVAL;
 }
 
 #ifdef PFM_PMU_USES_DBR
@@ -2303,21 +2336,21 @@ pfm_get_pmc_reset(struct task_struct *task, pfm_context_t *ctx, void *arg, int c
  */
 static pfm_cmd_desc_t pfm_cmd_tab[]={
 /* 0  */{ NULL, 0, 0, 0}, /* not used */
-/* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
-/* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
-/* 3  */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
+/* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
+/* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
+/* 3  */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
 /* 4  */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 5  */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 6  */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 7  */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 8  */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, 1, sizeof(pfarg_context_t)},
+/* 8  */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_RW, 1, sizeof(pfarg_context_t)},
 /* 9  */{ pfm_context_destroy, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
 /* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 12 */{ pfm_get_features, PFM_CMD_ARG_WRITE, 0, 0},
+/* 12 */{ pfm_get_features, PFM_CMD_ARG_RW, 0, 0},
 /* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
 /* 14 */{ pfm_context_unprotect, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
+/* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
 /* 16 */{ NULL, 0, 0, 0}, /* not used */
 /* 17 */{ NULL, 0, 0, 0}, /* not used */
 /* 18 */{ NULL, 0, 0, 0}, /* not used */
@@ -2335,8 +2368,8 @@ static pfm_cmd_desc_t pfm_cmd_tab[]={
 /* 30 */{ NULL, 0, 0, 0}, /* not used */
 /* 31 */{ NULL, 0, 0, 0}, /* not used */
 #ifdef PFM_PMU_USES_DBR
-/* 32 */{ pfm_write_ibrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)},
-/* 33 */{ pfm_write_dbrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)}
+/* 32 */{ pfm_write_ibrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)},
+/* 33 */{ pfm_write_dbrs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, sizeof(pfarg_dbreg_t)}
 #endif
 };
 #define PFM_CMD_COUNT	(sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t))
@@ -2389,7 +2422,7 @@ sys_perfmonctl (pid_t pid, int cmd, void *arg, int count, long arg5, long arg6,
 
 	if (PFM_CMD_READ_ARG(cmd) && !access_ok(VERIFY_READ, arg, sz*count)) return -EFAULT;
 
-	if (PFM_CMD_WRITE_ARG(cmd) && !access_ok(VERIFY_WRITE, arg, sz*count)) return -EFAULT;
+	if (PFM_CMD_RW_ARG(cmd) && !access_ok(VERIFY_WRITE, arg, sz*count)) return -EFAULT;
 
 	if (PFM_CMD_USE_PID(cmd))  {
 		/* 
@@ -2551,8 +2584,8 @@ pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ov
 	 */
 	h->pid  = current->pid;
 	h->cpu  = smp_processor_id();
-	h->rate = 0; /* XXX: add the sampling rate used here */
-	h->ip   = regs ? regs->cr_iip : 0x0;	/* where did the fault happened */
+	h->last_reset_value = ovfl_mask ? ctx->ctx_soft_pmds[ffz(~ovfl_mask)].lval : 0UL;
+	h->ip   = regs ? regs->cr_iip : 0x0UL;	/* where the fault happened */
 	h->regs = ovfl_mask; 			/* which registers overflowed */
 
 	/* guaranteed to monotonically increase on each cpu */
@@ -2572,8 +2605,6 @@ pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ov
 
 		if (PMD_IS_COUNTING(j)) {
 			*e  =  pfm_read_soft_counter(ctx, j);
-			/* check if this pmd overflowed as well */
-			*e +=  ovfl_mask & (1UL<<j) ? 1 + pmu_conf.perf_ovfl_val : 0;
 		} else {
 			*e = ia64_get_pmd(j); /* slow */
 		}
@@ -2674,23 +2705,13 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
 			  i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));
 
 		/*
-		 * Because we sometimes (EARS/BTB) reset to a specific value, we cannot simply use
-		 * val to count the number of times we overflowed. Otherwise we would loose the 
-		 * current value in the PMD (which can be >0). So to make sure we don't loose
-		 * the residual counts we set val to contain full 64bits value of the counter.
+		 * Note that the pmd is not necessarily 0 at this point as qualified events
+		 * may have happened before the PMU was frozen. The residual count is not
+		 * taken into consideration here but will be with any read of the pmd via
+		 * pfm_read_pmds().
 		 */
 		old_val = ctx->ctx_soft_pmds[i].val;
-		ctx->ctx_soft_pmds[i].val = 1 + pmu_conf.perf_ovfl_val + pfm_read_soft_counter(ctx, i);
-
-		DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx\n", 
-			  i, ctx->ctx_soft_pmds[i].val, old_val, 
-			  ia64_get_pmd(i) & pmu_conf.perf_ovfl_val));
-
-		/*
-		 * now that we have extracted the hardware counter, we can clear it to ensure
-		 * that a subsequent PFM_READ_PMDS will not include it again.
-		 */
-		ia64_set_pmd(i, 0UL);
+		ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.perf_ovfl_val;
 
 		/*
 		 * check for overflow condition
@@ -2699,12 +2720,15 @@ pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, str
 
 			ovfl_pmds |= 1UL << i;
 
-			DBprintk_ovfl(("soft_pmd[%d] overflowed flags=0x%x, ovfl=0x%lx\n", i, ctx->ctx_soft_pmds[i].flags, ovfl_pmds));
-
 			if (PMC_OVFL_NOTIFY(ctx, i)) {
 				ovfl_notify |= 1UL << i;
 			}
 		}
+		DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n", 
+			  i, ctx->ctx_soft_pmds[i].val, old_val, 
+			  ia64_get_pmd(i) & pmu_conf.perf_ovfl_val, ovfl_pmds, ovfl_notify));
 	}
 
 	/*
@@ -3292,6 +3316,30 @@ pfm_load_regs (struct task_struct *task)
 
 	owner = PMU_OWNER();
 	ctx   = task->thread.pfm_context;
+	t     = &task->thread;
+
+	/*
+	 * we restore ALL the debug registers to avoid picking up 
+	 * stale state.
+	 *
+	 * This must be done even when the task is still the owner
+	 * as the registers may have been modified via ptrace()
+	 * (not perfmon) by the previous task. 
+	 *
+	 * XXX: dealing with this in a lazy fashion requires modifications
+	 * to the way the debug registers are managed. This will be done
+	 * in the next version of perfmon.
+	 */
+	if (ctx->ctx_fl_using_dbreg) {
+		for (i=0; i < pmu_conf.num_ibrs; i++) {
+			ia64_set_ibr(i, t->ibr[i]);
+		}
+		ia64_srlz_i();
+		for (i=0; i < pmu_conf.num_dbrs; i++) {
+			ia64_set_dbr(i, t->dbr[i]);
+		}
+		ia64_srlz_d();
+	}
 
 	/*
 	 * if we were the last user, then nothing to do except restore psr
@@ -3327,7 +3375,6 @@ pfm_load_regs (struct task_struct *task)
 		pfm_fetch_regs(cpu, task, ctx);
 	}
 #endif
-	t = &task->thread;
 
 	/*
 	 * To avoid leaking information to the user level when psr.sp=0,
@@ -3357,21 +3404,6 @@ pfm_load_regs (struct task_struct *task)
 		if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
 	}
 
-	/*
-	 * we restore ALL the debug registers to avoid picking up 
-	 * stale state.
-	 */
-	if (ctx->ctx_fl_using_dbreg) {
-		for (i=0; i < pmu_conf.num_ibrs; i++) {
-			ia64_set_ibr(i, t->ibr[i]);
-		}
-		ia64_srlz_i();
-		for (i=0; i < pmu_conf.num_dbrs; i++) {
-			ia64_set_dbr(i, t->dbr[i]);
-		}
-	}
-	ia64_srlz_d();
-
 	if (t->pmc[0] & ~0x1) {
 		pfm_overflow_handler(task, ctx, t->pmc[0], NULL);
 	}
@@ -3766,18 +3798,12 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
 	m = nctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
 	for(i = PMU_FIRST_COUNTER ; m ; m>>=1, i++) {
 		if ((m & 0x1) && pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING) {
-			nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].ival & ~pmu_conf.perf_ovfl_val;
-			thread->pmd[i]	      	   = nctx->ctx_soft_pmds[i].ival & pmu_conf.perf_ovfl_val;
+			nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].lval & ~pmu_conf.perf_ovfl_val;
+			thread->pmd[i]	      	   = nctx->ctx_soft_pmds[i].lval & pmu_conf.perf_ovfl_val;
+		} else {
+			thread->pmd[i]	      	   = 0UL; /* reset to initial state */
 		}
-		/* what about the other pmds? zero or keep as is */
-
 	}
-	/*
-	 * clear BTB index register
-	 * XXX: CPU-model specific knowledge!
-	 */
-	thread->pmd[16] = 0;
-
 
 	nctx->ctx_fl_frozen    = 0;
 	nctx->ctx_ovfl_regs[0] = 0UL;
@@ -3947,7 +3973,8 @@ pfm_context_exit(struct task_struct *task)
 		pfm_sessions.pfs_sys_session[ctx->ctx_cpu] = NULL;
 		pfm_sessions.pfs_sys_sessions--;
 		DBprintk(("freeing syswide session on CPU%ld\n", ctx->ctx_cpu));
-		/* update perfmon debug register counter */
+
+		/* update perfmon debug register usage counter */
 		if (ctx->ctx_fl_using_dbreg) {
 			if (pfm_sessions.pfs_sys_use_dbregs == 0) {
 				printk("perfmon: invalid release for [%d] sys_use_dbregs=0\n", task->pid);
@@ -3990,7 +4017,8 @@ pfm_cleanup_smpl_buf(struct task_struct *task)
 	 * Walk through the list and free the sampling buffer and psb
 	 */
 	while (psb) {
-		DBprintk(("[%d] freeing smpl @%p size %ld\n", current->pid, psb->psb_hdr, psb->psb_size));
+		DBprintk(("[%d] freeing smpl @%p size %ld\n", 
+			current->pid, psb->psb_hdr, psb->psb_size));
 
 		pfm_rvfree(psb->psb_hdr, psb->psb_size);
 		tmp = psb->psb_next;
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
index 0a2082fb17b19d671b48b6f8f22dc8e497e73efc..f3926a3c4d73f0215fb137f4434174500c1c17a2 100644
--- a/arch/ia64/kernel/semaphore.c
+++ b/arch/ia64/kernel/semaphore.c
@@ -15,8 +15,8 @@
  * test if they need to do any extra work (up needs to do something
  * only if count was negative before the increment operation.
  *
- * "sleepers" and the contention routine ordering is protected by the
- * semaphore spinlock.
+ * "sleeping" and the contention routine ordering is protected
+ * by the spinlock in the semaphore's waitqueue head.
  *
  * Note that these functions are only called when there is contention
  * on the lock, and as such all this is the "non-critical" part of the
@@ -44,40 +44,42 @@ __up (struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
-
 void
 __down (struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
 	tsk->state = TASK_UNINTERRUPTIBLE;
-	add_wait_queue_exclusive(&sem->wait, &wait);
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
 
-	spin_lock_irq(&semaphore_lock);
 	sem->sleepers++;
 	for (;;) {
 		int sleepers = sem->sleepers;
 
 		/*
 		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock.
+		 * playing, because we own the spinlock in
+		 * the wait_queue_head.
 		 */
 		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
 			sem->sleepers = 0;
 			break;
 		}
 		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irq(&semaphore_lock);
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
 
 		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
 		tsk->state = TASK_UNINTERRUPTIBLE;
-		spin_lock_irq(&semaphore_lock);
 	}
-	spin_unlock_irq(&semaphore_lock);
-	remove_wait_queue(&sem->wait, &wait);
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
 	tsk->state = TASK_RUNNING;
-	wake_up(&sem->wait);
 }
 
 int
@@ -86,10 +88,12 @@ __down_interruptible (struct semaphore * sem)
 	int retval = 0;
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
 	tsk->state = TASK_INTERRUPTIBLE;
-	add_wait_queue_exclusive(&sem->wait, &wait);
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
 
-	spin_lock_irq(&semaphore_lock);
 	sem->sleepers ++;
 	for (;;) {
 		int sleepers = sem->sleepers;
@@ -110,25 +114,27 @@ __down_interruptible (struct semaphore * sem)
 
 		/*
 		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock. The
-		 * "-1" is because we're still hoping to get
-		 * the lock.
+		 * playing, because we own the spinlock in
+		 * the wait_queue_head. The "-1" is because we're
+		 * still hoping to get the semaphore.
 		 */
 		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
 			sem->sleepers = 0;
 			break;
 		}
 		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irq(&semaphore_lock);
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
 
 		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
 		tsk->state = TASK_INTERRUPTIBLE;
-		spin_lock_irq(&semaphore_lock);
 	}
-	spin_unlock_irq(&semaphore_lock);
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
 	tsk->state = TASK_RUNNING;
-	remove_wait_queue(&sem->wait, &wait);
-	wake_up(&sem->wait);
 	return retval;
 }
 
@@ -142,17 +148,19 @@ __down_trylock (struct semaphore *sem)
 	unsigned long flags;
 	int sleepers;
 
-	spin_lock_irqsave(&semaphore_lock, flags);
+	spin_lock_irqsave(&sem->wait.lock, flags);
 	sleepers = sem->sleepers + 1;
 	sem->sleepers = 0;
 
 	/*
 	 * Add "everybody else" and us into it. They aren't
-	 * playing, because we own the spinlock.
+	 * playing, because we own the spinlock in the
+	 * wait_queue_head.
 	 */
-	if (!atomic_add_negative(sleepers, &sem->count))
-		wake_up(&sem->wait);
+	if (!atomic_add_negative(sleepers, &sem->count)) {
+		wake_up_locked(&sem->wait);
+	}
 
-	spin_unlock_irqrestore(&semaphore_lock, flags);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
 	return 1;
 }
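
The "sleepers" accounting deserves a worked example (a sketch, not part of
the patch).  Every task entering the slow path has already decremented
sem->count once on the fast path, and atomic_add_negative(sleepers - 1,
&sem->count) folds all but one of those decrements back in before testing:

	/* X holds the semaphore, count == 0 */
	A: fast-path decrement            count = -1
	   sleepers = 1; add (1-1) = 0    count = -1  -> negative: sleep
	B: fast-path decrement            count = -2
	   sleepers = 2; add (2-1) = 1    count = -1  -> negative: sleep
	X: up(): increment                count =  0, wakes A
	A: sleepers = 1; add 0            count =  0  -> non-negative:
	   A owns the semaphore, sets sleepers = 0, wakes B
	B: sleepers = 0; add (0-1) = -1   count = -1  -> negative:
	   sleepers = 1, sleep again

The invariant is that count == 0 means "held, no waiters" and count < 0
means "held, with waiters", without the number of waiters ever being stored
in count itself.
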
diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c
index 5a595596d7b71ec8deb1cc7f1e188aa52e129d54..c7530c6bec761d9d169e5d8eb59204cbb46df031 100644
--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -354,6 +354,15 @@ setup_sigcontext (struct sigcontext *sc, sigset_t *mask, struct sigscratch *scr)
 	return err;
 }
 
+/*
+ * Check whether the register-backing store is already on the signal stack.
+ */
+static inline int
+rbs_on_sig_stack (unsigned long bsp)
+{
+	return (bsp - current->sas_ss_sp < current->sas_ss_size);
+}
+
 static long
 setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 	     struct sigscratch *scr)
@@ -366,10 +375,17 @@ setup_frame (int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set,
 
 	frame = (void *) scr->pt.r12;
 	tramp_addr = GATE_ADDR + (ia64_sigtramp - __start_gate_section);
-	if ((ka->sa.sa_flags & SA_ONSTACK) != 0 && !on_sig_stack((unsigned long) frame)) {
-		new_rbs  = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1);
-		frame = (void *) ((current->sas_ss_sp + current->sas_ss_size)
-				  & ~(STACK_ALIGN - 1));
+	if (ka->sa.sa_flags & SA_ONSTACK) {
+		/*
+		 * We need to check the memory and register stacks separately, because
+		 * they're switched separately (memory stack is switched in the kernel,
+		 * register stack is switched in the signal trampoline).
+		 */
+		if (!on_sig_stack((unsigned long) frame))
+			frame = (void *) ((current->sas_ss_sp + current->sas_ss_size)
+					  & ~(STACK_ALIGN - 1));
+		if (!rbs_on_sig_stack(scr->pt.ar_bspstore))
+			new_rbs  = (current->sas_ss_sp + sizeof(long) - 1) & ~(sizeof(long) - 1);
 	}
 	frame = (void *) frame - ((sizeof(*frame) + STACK_ALIGN - 1) & ~(STACK_ALIGN - 1));
 
diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile
index 164a11c609b883f1a141b0975413e9e78bb10ab8..e60a441f9aa6256a1fcc8859a830049513b94614 100644
--- a/arch/ia64/lib/Makefile
+++ b/arch/ia64/lib/Makefile
@@ -15,6 +15,7 @@ obj-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o			\
 
 obj-$(CONFIG_ITANIUM)	+= copy_page.o copy_user.o memcpy.o
 obj-$(CONFIG_MCKINLEY)	+= copy_page_mck.o memcpy_mck.o
+obj-$(CONFIG_PERFMON)	+= carta_random.o
 
 IGNORE_FLAGS_OBJS =	__divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \
 			__divdi3.o __udivdi3.o __moddi3.o __umoddi3.o
diff --git a/arch/ia64/lib/carta_random.S b/arch/ia64/lib/carta_random.S
new file mode 100644
index 0000000000000000000000000000000000000000..d0674c3603642872770af4409f9603ec01152e9e
--- /dev/null
+++ b/arch/ia64/lib/carta_random.S
@@ -0,0 +1,54 @@
+/*
+ * Fast, simple, yet decent quality random number generator based on
+ * a paper by David G. Carta ("Two Fast Implementations of the
+ * `Minimal Standard' Random Number Generator," Communications of the
+ * ACM, January, 1990).
+ *
+ * Copyright (C) 2002 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ */
+
+#include <asm/asmmacro.h>
+
+#define a	r2
+#define m	r3
+#define lo	r8
+#define hi	r9
+#define t0	r16
+#define t1	r17
+#define	seed	r32
+
+GLOBAL_ENTRY(carta_random32)
+	movl	a = (16807 << 16) | 16807
+	;;
+	pmpyshr2.u t0 = a, seed, 0
+	pmpyshr2.u t1 = a, seed, 16
+	;;
+	unpack2.l t0 = t1, t0
+	dep	m = -1, r0, 0, 31
+	;;
+	zxt4	lo = t0
+	shr.u	hi = t0, 32
+	;;
+	dep	t0 = 0, hi, 15, 49	// t0 = (hi & 0x7fff)
+	;;
+	shl	t0 = t0, 16		// t0 = (hi & 0x7fff) << 16
+	shr	t1 = hi, 15		// t1 = (hi >> 15)
+	;;
+	add	lo = lo, t0
+	;;
+	cmp.gtu	p6, p0 = lo, m
+	;;
+(p6)	and	lo = lo, m
+	;;
+(p6)	add	lo = 1, lo
+	;;
+	add	lo = lo, t1
+	;;
+	cmp.gtu p6, p0 = lo, m
+	;;
+(p6)	and	lo = lo, m
+	;;
+(p6)	add	lo = 1, lo
+	br.ret.sptk.many rp
+END(carta_random32)
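
Carta's observation: with m = 2^31 - 1, 2^31 == 1 (mod m), so the 46-bit
product 16807 * seed can be reduced by folding its high bits into the low 31
bits instead of dividing.  A C reference model (a sketch; the assembly builds
the product from 16-bit partial products and folds in two steps, but computes
the same generator):

	unsigned long
	carta_random32_model (unsigned long seed)
	{
		unsigned long long p = 16807ULL * (unsigned int) seed;
		unsigned long lo = p & 0x7fffffff;	/* low 31 bits */
		unsigned long hi = p >> 31;		/* remaining high bits */
		unsigned long r = lo + hi;		/* 2^31 == 1 (mod m) */

		if (r > 0x7fffffff)
			r -= 0x7fffffff;		/* one fold always suffices here */
		return r;
	}
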
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 546c113284441e1cc9e8775d682b357e5aede9a8..c2024b1c59e2f8252b205457ee6339b60acd9671 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -78,7 +78,7 @@ ia64_init_addr_space (void)
 		vma->vm_mm = current->mm;
 		vma->vm_start = IA64_RBS_BOT;
 		vma->vm_end = vma->vm_start + PAGE_SIZE;
-		vma->vm_page_prot = PAGE_COPY;
+		vma->vm_page_prot = protection_map[VM_READ | VM_WRITE];
 		vma->vm_flags = VM_READ|VM_WRITE|VM_MAYREAD|VM_MAYWRITE|VM_GROWSUP;
 		vma->vm_ops = NULL;
 		vma->vm_pgoff = 0;
diff --git a/include/asm-ia64/pal.h b/include/asm-ia64/pal.h
index 26107d9e1e56106e689ce681878797977c843a77..840ae4bd55c4a2beb61623e4b46332eb03103673 100644
--- a/include/asm-ia64/pal.h
+++ b/include/asm-ia64/pal.h
@@ -78,6 +78,7 @@
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
+#include <asm/fpu.h>
 
 /*
  * Data types needed to pass information into PAL procedures and
@@ -649,12 +650,43 @@ extern struct ia64_pal_retval ia64_pal_call_static (u64, u64, u64, u64, u64);
 extern struct ia64_pal_retval ia64_pal_call_stacked (u64, u64, u64, u64);
 extern struct ia64_pal_retval ia64_pal_call_phys_static (u64, u64, u64, u64);
 extern struct ia64_pal_retval ia64_pal_call_phys_stacked (u64, u64, u64, u64);
-
-#define PAL_CALL(iprv,a0,a1,a2,a3)		iprv = ia64_pal_call_static(a0, a1, a2, a3, 0)
-#define PAL_CALL_IC_OFF(iprv,a0,a1,a2,a3)	iprv = ia64_pal_call_static(a0, a1, a2, a3, 1)
-#define PAL_CALL_STK(iprv,a0,a1,a2,a3)		iprv = ia64_pal_call_stacked(a0, a1, a2, a3)
-#define PAL_CALL_PHYS(iprv,a0,a1,a2,a3)		iprv = ia64_pal_call_phys_static(a0, a1, a2, a3)
-#define PAL_CALL_PHYS_STK(iprv,a0,a1,a2,a3)	iprv = ia64_pal_call_phys_stacked(a0, a1, a2, a3)
+extern void ia64_save_scratch_fpregs (struct ia64_fpreg *);
+extern void ia64_load_scratch_fpregs (struct ia64_fpreg *);
+
+#define PAL_CALL(iprv,a0,a1,a2,a3) do {			\
+	struct ia64_fpreg fr[6];			\
+	ia64_save_scratch_fpregs(fr);			\
+	iprv = ia64_pal_call_static(a0, a1, a2, a3, 0);	\
+	ia64_load_scratch_fpregs(fr);			\
+} while (0)
+
+#define PAL_CALL_IC_OFF(iprv,a0,a1,a2,a3) do {		\
+	struct ia64_fpreg fr[6];			\
+	ia64_save_scratch_fpregs(fr);			\
+	iprv = ia64_pal_call_static(a0, a1, a2, a3, 1);	\
+	ia64_load_scratch_fpregs(fr);			\
+} while (0)
+
+#define PAL_CALL_STK(iprv,a0,a1,a2,a3) do {		\
+	struct ia64_fpreg fr[6];			\
+	ia64_save_scratch_fpregs(fr);			\
+	iprv = ia64_pal_call_stacked(a0, a1, a2, a3);	\
+	ia64_load_scratch_fpregs(fr);			\
+} while (0)
+
+#define PAL_CALL_PHYS(iprv,a0,a1,a2,a3) do {			\
+	struct ia64_fpreg fr[6];				\
+	ia64_save_scratch_fpregs(fr);				\
+	iprv = ia64_pal_call_phys_static(a0, a1, a2, a3);	\
+	ia64_load_scratch_fpregs(fr);				\
+} while (0)
+
+#define PAL_CALL_PHYS_STK(iprv,a0,a1,a2,a3) do {		\
+	struct ia64_fpreg fr[6];				\
+	ia64_save_scratch_fpregs(fr);				\
+	iprv = ia64_pal_call_phys_stacked(a0, a1, a2, a3);	\
+	ia64_load_scratch_fpregs(fr);				\
+} while (0)
 
 typedef int (*ia64_pal_handler) (u64, ...);
 extern ia64_pal_handler ia64_pal;
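
Callers don't change: the save/restore is hidden inside the macros.  For
example (a sketch, using PAL_VM_SUMMARY as the index and the usual
ia64_pal_retval fields):

	struct ia64_pal_retval iprv;

	PAL_CALL(iprv, PAL_VM_SUMMARY, 0, 0, 0);	/* f10-f15 are now preserved */
	if (iprv.status == 0) {
		/* ... consume iprv.v0/iprv.v1 ... */
	}
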
diff --git a/include/asm-ia64/perfmon.h b/include/asm-ia64/perfmon.h
index 2b95dc27e73ef63a448bb716a1df75d825fb55f6..ade86bbc0779b9f0ce90e16a657d5b6384802115 100644
--- a/include/asm-ia64/perfmon.h
+++ b/include/asm-ia64/perfmon.h
@@ -45,6 +45,7 @@
  * PMC flags
  */
 #define PFM_REGFL_OVFL_NOTIFY	0x1	/* send notification on overflow */
+#define PFM_REGFL_RANDOM	0x2	/* randomize sampling periods    */
 
 /*
  * PMD/PMC/IBR/DBR return flags (ignored on input)
@@ -86,8 +87,10 @@ typedef struct {
 	unsigned long	reg_short_reset;/* reset after counter overflow (small) */
 
 	unsigned long	reg_reset_pmds[4]; /* which other counters to reset on overflow */
+	unsigned long	reg_random_seed;   /* seed value when randomization is used */
+	unsigned long	reg_random_mask;   /* bitmask used to limit random value */
 
-	unsigned long   reserved[16];	/* for future use */
+	unsigned long   reserved[14];	/* for future use */
 } pfarg_reg_t;
 
 typedef struct {
@@ -132,28 +135,28 @@ typedef struct {
 #define PFM_VERSION_MINOR(x)	((x) & 0xffff)
 
 /*
- * Entry header in the sampling buffer.
- * The header is directly followed with the PMDS saved in increasing index 
- * order: PMD4, PMD5, .... How many PMDs are present is determined by the 
- * user program during context creation.
+ * Entry header in the sampling buffer.  The header is directly followed
+ * with the PMDs saved in increasing index order: PMD4, PMD5, .... How
+ * many PMDs are present is determined by the user program during
+ * context creation.
  *
- * XXX: in this version of the entry, only up to 64 registers can be recorded
- * This should be enough for quite some time. Always check sampling format
- * before parsing entries!
+ * XXX: in this version of the entry, only up to 64 registers can be
+ * recorded. This should be enough for quite some time. Always check
+ * sampling format before parsing entries!
  *
- * Inn the case where multiple counters have overflowed at the same time, the 
- * rate field indicate the initial value of the first PMD, based on the index.
- * For instance, if PMD2 and PMD5 have ovewrflowed for this entry, the rate field
- * will show the initial value of PMD2.
+ * In the case where multiple counters overflow at the same time, the
+ * last_reset_value member indicates the initial value of the PMD with
+ * the smallest index.  For instance, if PMD2 and PMD5 have overflowed,
+ * the last_reset_value member contains the initial value of PMD2.
  */
 typedef struct {
-	int		pid;		/* identification of process */
-	int		cpu;		/* which cpu was used */
-	unsigned long	rate;		/* initial value of overflowed counter */
-	unsigned long	stamp;		/* timestamp */
-	unsigned long	ip;		/* where did the overflow interrupt happened */
-	unsigned long	regs;		/* bitmask of which registers overflowed */
-	unsigned long   period;		/* sampling period used by overflowed counter (smallest pmd index) */
+	int		pid;			/* identification of process */
+	int		cpu;			/* which cpu was used */
+	unsigned long	last_reset_value;	/* initial value of counter that overflowed */
+	unsigned long	stamp;			/* timestamp */
+	unsigned long	ip;			/* where the overflow interrupt happened */
+	unsigned long	regs;			/* bitmask of which registers overflowed */
+	unsigned long   period;			/* unused */
 } perfmon_smpl_entry_t;
 
 extern int perfmonctl(pid_t pid, int cmd, void *arg, int narg);
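
Putting the new fields together: the PFM_REGFL_RANDOM flag goes on the PMC,
the seed and mask on the matching PMD.  A user-level sketch (command names as
in the pfm_cmd_tab above; event-selection details elided):

	pfarg_reg_t pc, pd;

	memset(&pc, 0, sizeof(pc));
	pc.reg_num   = 4;			/* PMC4/PMD4 pair, for example */
	pc.reg_flags = PFM_REGFL_OVFL_NOTIFY | PFM_REGFL_RANDOM;

	memset(&pd, 0, sizeof(pd));
	pd.reg_num         = 4;
	pd.reg_value       = -100000UL;		/* ~100000 events per sample */
	pd.reg_long_reset  = -100000UL;
	pd.reg_short_reset = -100000UL;
	pd.reg_random_seed = 42;
	pd.reg_random_mask = 0xfff;	/* lengthen each period by up to 4095 events */

	perfmonctl(pid, PFM_WRITE_PMCS, &pc, 1);
	perfmonctl(pid, PFM_WRITE_PMDS, &pd, 1);
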
diff --git a/include/asm-ia64/rse.h b/include/asm-ia64/rse.h
index b65b28421df9a382333c994da1c15003a93d82fa..02830a3b0196dba7e7f658fc6b23ef55fd4d19d0 100644
--- a/include/asm-ia64/rse.h
+++ b/include/asm-ia64/rse.h
@@ -37,9 +37,9 @@ ia64_rse_rnat_addr (unsigned long *slot_addr)
 }
 
 /*
- * Calcuate the number of registers in the dirty partition starting at
- * BSPSTORE with a size of DIRTY bytes.  This isn't simply DIRTY
- * divided by eight because the 64th slot is used to store ar.rnat.
+ * Calculate the number of registers in the dirty partition starting at BSPSTORE and
+ * ending at BSP.  This isn't simply (BSP-BSPSTORE)/8 because every 64th slot stores
+ * ar.rnat.
  */
 static __inline__ unsigned long
 ia64_rse_num_regs (unsigned long *bspstore, unsigned long *bsp)
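
A quick numeric check of that comment (a model of the helper's arithmetic,
compilable outside the kernel):

	#include <assert.h>

	/* every 64th slot of the backing store holds ar.rnat, not a register */
	static unsigned long
	num_regs (unsigned long bspstore, unsigned long bsp)
	{
		unsigned long slots = (bsp - bspstore) / 8;

		return slots - (((bspstore >> 3) & 0x3f) + slots) / 0x40;
	}

	int
	main (void)
	{
		assert(num_regs(0x9ff0, 0x9ff0 + 4*8) == 3);	/* crosses an RNaT slot */
		assert(num_regs(0x9000, 0x9000 + 4*8) == 4);	/* doesn't cross one */
		return 0;
	}
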
diff --git a/include/asm-ia64/sal.h b/include/asm-ia64/sal.h
index 841980bb30a363132f17ebf2178aa716a963b168..4be92e5b6bede67659a62e8467d60477cee85711 100644
--- a/include/asm-ia64/sal.h
+++ b/include/asm-ia64/sal.h
@@ -38,9 +38,12 @@ extern spinlock_t sal_lock;
 
 # define SAL_CALL(result,args...) do {			\
 	unsigned long flags;				\
+	struct ia64_fpreg fr[6];                        \
+	ia64_save_scratch_fpregs(fr);                   \
 	spin_lock_irqsave(&sal_lock, flags);		\
 	__SAL_CALL(result,args);			\
 	spin_unlock_irqrestore(&sal_lock, flags);	\
+	ia64_load_scratch_fpregs(fr);                   \
 } while (0)
 
 #define SAL_SET_VECTORS			0x01000000
diff --git a/include/asm-ia64/siginfo.h b/include/asm-ia64/siginfo.h
index 57addd404638537084751073754759e03abfcb0a..c418cec628b2b82d967f8f79589526454ad7aeda 100644
--- a/include/asm-ia64/siginfo.h
+++ b/include/asm-ia64/siginfo.h
@@ -66,6 +66,7 @@ typedef struct siginfo {
 			long _band;	/* POLL_IN, POLL_OUT, POLL_MSG (XPG requires a "long") */
 			int _fd;
 		} _sigpoll;
+
 		/* SIGPROF */
 		struct {
 			pid_t _pid;		/* which child */
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index f43d3c682516d9a9149b421664f2b7490d954a06..5bf4030841bc02041ba9b99bcb365e9bb9a83513 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -148,7 +148,7 @@ do {										\
 			      "cmp.ne p6,p7=%1,r0;;"				\
 			      "(p6) ssm psr.i;"					\
 			      "(p7) rsm psr.i;;"				\
-			      "srlz.d"						\
+			      "(p6) srlz.d"					\
 			      : "=&r" (old_psr) : "r"((psr) & IA64_PSR_I)	\
 			      : "p6", "p7", "memory");				\
 	if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I)) {			\