Merge tiger.hpl.hp.com:/data1/bk/vanilla/linux-2.5

into tiger.hpl.hp.com:/data1/bk/lia64/to-linus-2.5

Merge tiger.hpl.hp.com:/data1/bk/vanilla/linux-2.5
into tiger.hpl.hp.com:/data1/bk/lia64/to-linus-2.5
fea891b3 · David Mosberger · c43626f4 · 6d9cf7b6 · fea891b3 · fea891b3
Commit fea891b3 authored May 24, 2002 by David Mosberger
38 changed files
--- a/arch/ia64/Config.help
+++ b/arch/ia64/Config.help
@@ -400,7 +400,7 @@ CONFIG_ITANIUM
  Select your IA64 processor type.  The default is Intel Itanium.

 CONFIG_MCKINLEY
-  Select this to configure for a McKinley processor.
+  Select this to configure for an Itanium 2 (McKinley) processor.

 CONFIG_IA64_GENERIC
  This selects the system type of your hardware.  A "generic" kernel

--- a/arch/ia64/Makefile
+++ b/arch/ia64/Makefile
@@ -69,13 +69,6 @@ ifdef CONFIG_IA64_SGI_SN
 				$(CORE_FILES)
 endif

-ifdef CONFIG_IA64_SOFTSDV
-        SUBDIRS         :=      arch/$(ARCH)/dig	\
-				$(SUBDIRS)
-        CORE_FILES      :=      arch/$(ARCH)/dig/dig.a	\
-				$(CORE_FILES)
-endif
-
 ifdef CONFIG_IA64_DIG
        SUBDIRS         :=      arch/$(ARCH)/dig	\
 				$(SUBDIRS)

--- a/arch/ia64/config.in
+++ b/arch/ia64/config.in
@@ -16,7 +16,7 @@ define_bool CONFIG_RWSEM_XCHGADD_ALGORITHM n

 choice 'IA-64 processor type' \
 	"Itanium			CONFIG_ITANIUM \
-	 McKinley			CONFIG_MCKINLEY" Itanium
+	 Itanium-2			CONFIG_MCKINLEY" Itanium

 choice 'IA-64 system type'					\
 	"generic		CONFIG_IA64_GENERIC		\
@@ -26,11 +26,18 @@ choice 'IA-64 system type'					\
 	 SGI-SN1		CONFIG_IA64_SGI_SN1		\
 	 SGI-SN2		CONFIG_IA64_SGI_SN2" generic

-choice 'Kernel page size'						\
+if [ "$CONFIG_ITANIUM" = "y" ]; then
+  choice 'Kernel page size'						\
+	"4KB			CONFIG_IA64_PAGE_SIZE_4KB		\
+	 8KB			CONFIG_IA64_PAGE_SIZE_8KB		\
+	 16KB			CONFIG_IA64_PAGE_SIZE_16KB" 16KB
+else
+  choice 'Kernel page size'						\
 	"4KB			CONFIG_IA64_PAGE_SIZE_4KB		\
 	 8KB			CONFIG_IA64_PAGE_SIZE_8KB		\
 	 16KB			CONFIG_IA64_PAGE_SIZE_16KB		\
 	 64KB			CONFIG_IA64_PAGE_SIZE_64KB" 16KB
+endif

 if [ "$CONFIG_IA64_HP_SIM" = "n" ]; then
  define_bool CONFIG_ACPI y

--- a/arch/ia64/hp/common/Makefile
+++ b/arch/ia64/hp/common/Makefile
@@ -12,17 +12,3 @@ export-objs := sba_iommu.o
 obj-y := sba_iommu.o

 include $(TOPDIR)/Rules.make
-#
-# ia64/platform/hp/common/Makefile
-#
-# Copyright (C) 2002 Hewlett Packard
-# Copyright (C) Alex Williamson (alex_williamson@hp.com)
-#
-
-O_TARGET := common.o
-
-export-objs := sba_iommu.o
-
-obj-y := sba_iommu.o
-
-include $(TOPDIR)/Rules.make
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1389,6 +1389,12 @@ sba_dma_address (struct scatterlist *sg)
 	return ((unsigned long)sba_sg_iova(sg));
 }

+int
+sba_dma_supported (struct pci_dev *dev, u64 mask)
+{
+	return 1;	/* every mask is reported as supported; dev and mask are not examined */
+}
+
 /**************************************************************
 *
 *   Initialization and claim
@@ -1858,5 +1864,6 @@ EXPORT_SYMBOL(sba_unmap_single);
 EXPORT_SYMBOL(sba_map_sg);
 EXPORT_SYMBOL(sba_unmap_sg);
 EXPORT_SYMBOL(sba_dma_address);
+EXPORT_SYMBOL(sba_dma_supported);
 EXPORT_SYMBOL(sba_alloc_consistent);
 EXPORT_SYMBOL(sba_free_consistent);
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -7,9 +7,9 @@
 * case means sys_sim.c console (goes via the simulator). The code hereafter
 * is completely leveraged from the serial.c driver.
 *
- * Copyright (C) 1999-2000 Hewlett-Packard Co
- * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
- * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 1999-2000, 2002 Hewlett-Packard Co
+ *	Stephane Eranian <eranian@hpl.hp.com>
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
 *
 * 02/04/00 D. Mosberger	Merged in serial.c bug fixes in rs_close().
 * 02/25/00 D. Mosberger	Synced up with 2.3.99pre-5 version of serial.c.
@@ -24,7 +24,7 @@
 #include <linux/major.h>
 #include <linux/fcntl.h>
 #include <linux/mm.h>
-#include <linux/malloc.h>
+#include <linux/slab.h>
 #include <linux/console.h>
 #include <linux/module.h>
 #include <linux/serial.h>

--- a/arch/ia64/hp/zx1/Makefile
+++ b/arch/ia64/hp/zx1/Makefile
@@ -11,16 +11,3 @@ obj-y := hpzx1_misc.o
 obj-$(CONFIG_IA64_GENERIC) += hpzx1_machvec.o

 include $(TOPDIR)/Rules.make
-#
-# ia64/platform/hp/zx1/Makefile
-#
-# Copyright (C) 2002 Hewlett Packard
-# Copyright (C) Alex Williamson (alex_williamson@hp.com)
-#
-
-O_TARGET := zx1.o
-
-obj-y := hpzx1_misc.o
-obj-$(CONFIG_IA64_GENERIC) += hpzx1_machvec.o
-
-include $(TOPDIR)/Rules.make
--- a/arch/ia64/hp/zx1/hpzx1_misc.c
+++ b/arch/ia64/hp/zx1/hpzx1_misc.c
@@ -198,7 +198,6 @@ extern acpi_status acpi_get_crs(acpi_handle, acpi_buffer *);
 extern acpi_resource *acpi_get_crs_next(acpi_buffer *, int *);
 extern acpi_resource_data *acpi_get_crs_type(acpi_buffer *, int *, int);
 extern void acpi_dispose_crs(acpi_buffer *);
-extern acpi_status acpi_cf_evaluate_method(acpi_handle, UINT8 *, NATIVE_UINT *);

 static acpi_status
 hp_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
@@ -388,407 +387,7 @@ hpzx1_acpi_dev_init(void)
 }

 extern void sba_init(void);
-	
-void
-hpzx1_pci_fixup (int phase)
-{
-	if (phase == 0)
-		hpzx1_acpi_dev_init();
-	iosapic_pci_fixup(phase);
-        if (phase == 1)
-		sba_init();
-}
-/*
- * Misc. support for HP zx1 chipset support
- *
- * Copyright (C) 2002 Hewlett-Packard Co
- * Copyright (C) 2002 Alex Williamson <alex_williamson@hp.com>
- * Copyright (C) 2002 Bjorn Helgaas <bjorn_helgaas@hp.com>
- */
-
-
-#include <linux/config.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <asm/iosapic.h>
-#include <asm/efi.h>
-
-#include "../drivers/acpi/include/platform/acgcc.h"
-#include "../drivers/acpi/include/actypes.h"
-#include "../drivers/acpi/include/acexcep.h"
-#include "../drivers/acpi/include/acpixf.h"
-#include "../drivers/acpi/include/actbl.h"
-#include "../drivers/acpi/include/acconfig.h"
-#include "../drivers/acpi/include/acmacros.h"
-#include "../drivers/acpi/include/aclocal.h"
-#include "../drivers/acpi/include/acobject.h"
-#include "../drivers/acpi/include/acstruct.h"
-#include "../drivers/acpi/include/acnamesp.h"
-#include "../drivers/acpi/include/acutils.h"
-
-#define PFX "hpzx1: "
-
-struct fake_pci_dev {
-	struct fake_pci_dev *next;
-	unsigned char bus;
-	unsigned int devfn;
-	int sizing;		// in middle of BAR sizing operation?
-	unsigned long csr_base;
-	unsigned int csr_size;
-	unsigned long mapped_csrs;	// ioremapped
-};
-
-static struct fake_pci_dev *fake_pci_head, **fake_pci_tail = &fake_pci_head;
-
-static struct pci_ops orig_pci_ops;
-
-static inline struct fake_pci_dev *
-fake_pci_find_slot(unsigned char bus, unsigned int devfn)
-{
-	struct fake_pci_dev *dev;
-
-	for (dev = fake_pci_head; dev; dev = dev->next)
-		if (dev->bus == bus && dev->devfn == devfn)
-			return dev;
-	return NULL;
-}
-
-static struct fake_pci_dev *
-alloc_fake_pci_dev(void)
-{
-        struct fake_pci_dev *dev;
-
-        dev = kmalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return NULL;
-
-	memset(dev, 0, sizeof(*dev));
-
-        *fake_pci_tail = dev;
-        fake_pci_tail = &dev->next;
-
-        return dev;
-}
-
-#define HP_CFG_RD(sz, bits, name) \
-static int hp_cfg_read##sz (struct pci_dev *dev, int where, u##bits *value) \
-{ \
-	struct fake_pci_dev *fake_dev; \
-	if (!(fake_dev = fake_pci_find_slot(dev->bus->number, dev->devfn))) \
-		return orig_pci_ops.name(dev, where, value); \
-	\
-	switch (where) { \
-	case PCI_COMMAND: \
-		*value = read##sz(fake_dev->mapped_csrs + where); \
-		*value |= PCI_COMMAND_MEMORY; /* SBA omits this */ \
-		break; \
-	case PCI_BASE_ADDRESS_0: \
-		if (fake_dev->sizing) \
-			*value = ~(fake_dev->csr_size - 1); \
-		else \
-			*value = (fake_dev->csr_base & \
-				    PCI_BASE_ADDRESS_MEM_MASK) | \
-				PCI_BASE_ADDRESS_SPACE_MEMORY; \
-		fake_dev->sizing = 0; \
-		break; \
-	default: \
-		*value = read##sz(fake_dev->mapped_csrs + where); \
-		break; \
-	} \
-	return PCIBIOS_SUCCESSFUL; \
-}

-#define HP_CFG_WR(sz, bits, name) \
-static int hp_cfg_write##sz (struct pci_dev *dev, int where, u##bits value) \
-{ \
-	struct fake_pci_dev *fake_dev; \
-	if (!(fake_dev = fake_pci_find_slot(dev->bus->number, dev->devfn))) \
-		return orig_pci_ops.name(dev, where, value); \
-	\
-	switch (where) { \
-	case PCI_BASE_ADDRESS_0: \
-		if (value == ~0) \
-			fake_dev->sizing = 1; \
-		break; \
-	default: \
-		write##sz(value, fake_dev->mapped_csrs + where); \
-		break; \
-	} \
-	return PCIBIOS_SUCCESSFUL; \
-}
-
-HP_CFG_RD(b,  8, read_byte)
-HP_CFG_RD(w, 16, read_word)
-HP_CFG_RD(l, 32, read_dword)
-HP_CFG_WR(b,  8, write_byte)
-HP_CFG_WR(w, 16, write_word)
-HP_CFG_WR(l, 32, write_dword)
-
-static struct pci_ops hp_pci_conf = {
-	hp_cfg_readb,
-	hp_cfg_readw,
-	hp_cfg_readl,
-	hp_cfg_writeb,
-	hp_cfg_writew,
-	hp_cfg_writel,
-};
-
-/*
- * Assume we'll never have a physical slot higher than 0x10, so we can
- * use slots above that for "fake" PCI devices to represent things
- * that only show up in the ACPI namespace.
- */
-#define HP_MAX_SLOT	0x10
-
-static struct fake_pci_dev *
-hpzx1_fake_pci_dev(unsigned long addr, unsigned int bus, unsigned int size)
-{
-	struct fake_pci_dev *dev;
-	int slot;
-
-	// Note: lspci thinks 0x1f is invalid
-	for (slot = 0x1e; slot > HP_MAX_SLOT; slot--) {
-		if (!fake_pci_find_slot(bus, PCI_DEVFN(slot, 0)))
-			break;
-	}
-	if (slot == HP_MAX_SLOT) {
-		printk(KERN_ERR PFX
-			"no slot space for device (0x%p) on bus 0x%02x\n",
-			(void *) addr, bus);
-		return NULL;
-	}
-
-	dev = alloc_fake_pci_dev();
-	if (!dev) {
-		printk(KERN_ERR PFX
-			"no memory for device (0x%p) on bus 0x%02x\n",
-			(void *) addr, bus);
-		return NULL;
-	}
-
-	dev->bus = bus;
-	dev->devfn = PCI_DEVFN(slot, 0);
-	dev->csr_base = addr;
-	dev->csr_size = size;
-
-	/*
-	 * Drivers should ioremap what they need, but we have to do
-	 * it here, too, so PCI config accesses work.
-	 */
-	dev->mapped_csrs = (unsigned long) ioremap(dev->csr_base, dev->csr_size);
-
-	return dev;
-}
-
-typedef struct {
-	u8	guid_id;
-	u8	guid[16];
-	u8	csr_base[8];
-	u8	csr_length[8];
-} acpi_hp_vendor_long;
-
-#define HP_CCSR_LENGTH 0x21
-#define HP_CCSR_TYPE 0x2
-#define HP_CCSR_GUID \
-    ((efi_guid_t) { 0x69e9adf9, 0x924f, 0xab5f, { 0xf6, 0x4a, 0x24, 0xd2, 0x01, 0x37, 0x0e, 0xad }})
-
-extern acpi_status acpi_get_crs(acpi_handle, acpi_buffer *);
-extern acpi_resource *acpi_get_crs_next(acpi_buffer *, int *);
-extern acpi_resource_data *acpi_get_crs_type(acpi_buffer *, int *, int);
-extern void acpi_dispose_crs(acpi_buffer *);
-extern acpi_status acpi_cf_evaluate_method(acpi_handle, UINT8 *, NATIVE_UINT *);
-
-static acpi_status
-hp_csr_space(acpi_handle obj, u64 *csr_base, u64 *csr_length)
-{
-	int i, offset = 0;
-	acpi_status status;
-	acpi_buffer buf;
-	acpi_resource_vendor *res;
-	acpi_hp_vendor_long *hp_res;
-	efi_guid_t vendor_guid;
-
-	*csr_base = 0;
-	*csr_length = 0;
-
-	status = acpi_get_crs(obj, &buf);
-	if (status != AE_OK) {
-		printk(KERN_ERR PFX "Unable to get _CRS data on object\n");
-		return status;
-	}
-
-	res = (acpi_resource_vendor *)acpi_get_crs_type(&buf, &offset, ACPI_RSTYPE_VENDOR);
-	if (!res) {
-		printk(KERN_ERR PFX "Failed to find config space for device\n");
-		acpi_dispose_crs(&buf);
-		return AE_NOT_FOUND;
-	}
-
-	hp_res = (acpi_hp_vendor_long *)(res->reserved);
-
-	if (res->length != HP_CCSR_LENGTH || hp_res->guid_id != HP_CCSR_TYPE) {
-		printk(KERN_ERR PFX "Unknown Vendor data\n");
-		acpi_dispose_crs(&buf);
-		return AE_TYPE; /* Revisit error? */
-	}
-
-	memcpy(&vendor_guid, hp_res->guid, sizeof(efi_guid_t));
-	if (efi_guidcmp(vendor_guid, HP_CCSR_GUID) != 0) {
-		printk(KERN_ERR PFX "Vendor GUID does not match\n");
-		acpi_dispose_crs(&buf);
-		return AE_TYPE; /* Revisit error? */
-	}
-
-	for (i = 0 ; i < 8 ; i++) {
-		*csr_base |= ((u64)(hp_res->csr_base[i]) << (i * 8));
-		*csr_length |= ((u64)(hp_res->csr_length[i]) << (i * 8));
-	}
-
-	acpi_dispose_crs(&buf);
-
-	return AE_OK;
-}
-
-static acpi_status
-hpzx1_sba_probe(acpi_handle obj, u32 depth, void *context, void **ret)
-{
-	u64 csr_base = 0, csr_length = 0;
-	char *name = context;
-	struct fake_pci_dev *dev;
-	acpi_status status;
-
-	status = hp_csr_space(obj, &csr_base, &csr_length);
-
-	if (status != AE_OK)
-		return status;
-
-	/*
-	 * Only SBA shows up in ACPI namespace, so its CSR space
-	 * includes both SBA and IOC.  Make SBA and IOC show up
-	 * separately in PCI space.
-	 */
-	if ((dev = hpzx1_fake_pci_dev(csr_base, 0, 0x1000)))
-		printk(KERN_INFO PFX "%s SBA at 0x%lx; pci dev %02x:%02x.%d\n",
-			name, csr_base, dev->bus,
-			PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
-	if ((dev = hpzx1_fake_pci_dev(csr_base + 0x1000, 0, 0x1000)))
-		printk(KERN_INFO PFX "%s IOC at 0x%lx; pci dev %02x:%02x.%d\n",
-			name, csr_base + 0x1000, dev->bus,
-			PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
-
-	return AE_OK;
-}
-
-static acpi_status
-hpzx1_lba_probe(acpi_handle obj, u32 depth, void *context, void **ret)
-{
-	acpi_status status;
-	u64 csr_base = 0, csr_length = 0;
-	char *name = context;
-	NATIVE_UINT busnum = 0;
-	struct fake_pci_dev *dev;
-
-	status = hp_csr_space(obj, &csr_base, &csr_length);
-
-	if (status != AE_OK)
-		return status;
-
-	status = acpi_cf_evaluate_method(obj, METHOD_NAME__BBN, &busnum);
-	if (ACPI_FAILURE(status)) {
-		printk(KERN_ERR PFX "evaluate _BBN fail=0x%x\n", status);
-		busnum = 0;	// no _BBN; stick it on bus 0
-	}
-
-	if ((dev = hpzx1_fake_pci_dev(csr_base, busnum, csr_length)))
-		printk(KERN_INFO PFX "%s LBA at 0x%lx, _BBN 0x%02x; "
-			"pci dev %02x:%02x.%d\n",
-			name, csr_base, busnum, dev->bus,
-			PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
-
-	return AE_OK;
-}
-
-static void
-hpzx1_acpi_dev_init(void)
-{
-	extern struct pci_ops pci_conf;
-
-	/*
-	 * Make fake PCI devices for the following hardware in the
-	 * ACPI namespace.  This makes it more convenient for drivers
-	 * because they can claim these devices based on PCI
-	 * information, rather than needing to know about ACPI.  The
-	 * 64-bit "HPA" space for this hardware is available as BAR
-	 * 0/1.
-	 *
-	 * HWP0001: Single IOC SBA w/o IOC in namespace
-	 * HWP0002: LBA device
-	 * HWP0003: AGP LBA device
-	 */
-	acpi_get_devices("HWP0001", hpzx1_sba_probe, "HWP0001", NULL);
-#ifdef CONFIG_IA64_HP_PROTO
-	if (fake_pci_tail != &fake_pci_head) {
-#endif
-	acpi_get_devices("HWP0002", hpzx1_lba_probe, "HWP0002", NULL);
-	acpi_get_devices("HWP0003", hpzx1_lba_probe, "HWP0003", NULL);
-
-#ifdef CONFIG_IA64_HP_PROTO
-	}
-
-#define ZX1_FUNC_ID_VALUE    (PCI_DEVICE_ID_HP_ZX1_SBA << 16) | PCI_VENDOR_ID_HP
-	/*
-	 * Early protos don't have bridges in the ACPI namespace, so
-	 * if we didn't find anything, add the things we know are
-	 * there.
-	 */
-	if (fake_pci_tail == &fake_pci_head) {
-		u64 hpa, csr_base;
-		struct fake_pci_dev *dev;
-
-		csr_base = 0xfed00000UL;
-		hpa = (u64) ioremap(csr_base, 0x1000);
-		if (__raw_readl(hpa) == ZX1_FUNC_ID_VALUE) {
-			if ((dev = hpzx1_fake_pci_dev(csr_base, 0, 0x1000)))
-				printk(KERN_INFO PFX "HWP0001 SBA at 0x%lx; "
-					"pci dev %02x:%02x.%d\n", csr_base,
-					dev->bus, PCI_SLOT(dev->devfn),
-					PCI_FUNC(dev->devfn));
-			if ((dev = hpzx1_fake_pci_dev(csr_base + 0x1000, 0,
-					0x1000)))
-				printk(KERN_INFO PFX "HWP0001 IOC at 0x%lx; "
-					"pci dev %02x:%02x.%d\n",
-					csr_base + 0x1000,
-					dev->bus, PCI_SLOT(dev->devfn),
-					PCI_FUNC(dev->devfn));
-
-			csr_base = 0xfed24000UL;
-			iounmap(hpa);
-			hpa = (u64) ioremap(csr_base, 0x1000);
-			if ((dev = hpzx1_fake_pci_dev(csr_base, 0x40, 0x1000)))
-				printk(KERN_INFO PFX "HWP0003 AGP LBA at "
-					"0x%lx; pci dev %02x:%02x.%d\n",
-					csr_base,
-					dev->bus, PCI_SLOT(dev->devfn),
-					PCI_FUNC(dev->devfn));
-		}
-		iounmap(hpa);
-	}
-#endif
-
-	if (fake_pci_tail == &fake_pci_head)
-		return;
-
-	/*
-	 * Replace PCI ops, but only if we made fake devices.
-	 */
-	orig_pci_ops = pci_conf;
-	pci_conf = hp_pci_conf;
-}
-
-extern void sba_init(void);
-	
 void
 hpzx1_pci_fixup (int phase)
 {

--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -660,10 +660,10 @@ acpi_get_prt (struct pci_vector_struct **vectors, int *count)

 	list_for_each(node, &acpi_prts.entries) {
 		entry = (struct acpi_prt_entry *)node;
-		vector[i].bus    = (u16) entry->id.bus;
-		vector[i].pci_id = (u32) entry->id.dev << 16 | 0xffff;
-		vector[i].pin    = (u8)  entry->id.pin;
-		vector[i].irq    = (u8)  entry->source.index;
+		vector[i].bus    = entry->id.bus;
+		vector[i].pci_id = ((u32) entry->id.dev) << 16 | 0xffff;
+		vector[i].pin    = entry->id.pin;
+		vector[i].irq    = entry->source.index;
 		i++;
 	}
 	*count = acpi_prts.count;

--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -5,7 +5,7 @@
 *
 * Copyright (C) 1999 VA Linux Systems
 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999-2001 Hewlett-Packard Co.
+ * Copyright (C) 1999-2002 Hewlett-Packard Co.
 *	David Mosberger-Tang <davidm@hpl.hp.com>
 *	Stephane Eranian <eranian@hpl.hp.com>
 *
@@ -212,8 +212,8 @@ efi_map_pal_code (void)
 	void *efi_map_start, *efi_map_end, *p;
 	efi_memory_desc_t *md;
 	u64 efi_desc_size;
-	int pal_code_count=0;
-	u64 mask, flags;
+	int pal_code_count = 0;
+	u64 mask, psr;
 	u64 vaddr;

 	efi_map_start = __va(ia64_boot_param->efi_memmap);
@@ -266,10 +266,10 @@ efi_map_pal_code (void)
 		/*
 		 * Cannot write to CRx with PSR.ic=1
 		 */
-		ia64_clear_ic(flags);
+		psr = ia64_clear_ic();
 		ia64_itr(0x1, IA64_TR_PALCODE, vaddr & mask,
 			 pte_val(pfn_pte(md->phys_addr >> PAGE_SHIFT, PAGE_KERNEL)), IA64_GRANULE_SHIFT);
-		local_irq_restore(flags);
+		ia64_set_psr(psr);
 		ia64_srlz_i();
 	}
 }
@@ -485,7 +485,7 @@ efi_get_iobase (void)
 }

 u32
-efi_mem_type (u64 phys_addr)
+efi_mem_type (unsigned long phys_addr)
 {
 	void *efi_map_start, *efi_map_end, *p;
 	efi_memory_desc_t *md;
@@ -506,7 +506,7 @@ efi_mem_type (u64 phys_addr)
 }

 u64
-efi_mem_attributes (u64 phys_addr)
+efi_mem_attributes (unsigned long phys_addr)
 {
 	void *efi_map_start, *efi_map_end, *p;
 	efi_memory_desc_t *md;

--- a/arch/ia64/kernel/efi_stub.S
+++ b/arch/ia64/kernel/efi_stub.S
@@ -53,23 +53,21 @@ GLOBAL_ENTRY(efi_call_phys)
 	mov loc4=ar.rsc			// save RSE configuration
 	mov ar.rsc=0			// put RSE in enforced lazy, LE mode
 	;;
-
 	ld8 gp=[in0]			// load EFI function's global pointer
-	mov out0=in1
-	mov out1=in2
 	movl r16=PSR_BITS_TO_CLEAR
-
 	mov loc3=psr			// save processor status word
 	movl r17=PSR_BITS_TO_SET
 	;;
-	mov out2=in3
 	or loc3=loc3,r17
 	mov b6=r2
 	;;
 	andcm r16=loc3,r16		// get psr with IT, DT, and RT bits cleared
-	mov out3=in4
 	br.call.sptk.many rp=ia64_switch_mode
 .ret0:	mov out4=in5
+	mov out0=in1
+	mov out1=in2
+	mov out2=in3
+	mov out3=in4
 	mov out5=in6
 	mov out6=in7
 	br.call.sptk.many rp=b6		// call the EFI function

--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -13,7 +13,7 @@
 #include <asm/unistd.h>
 #include <asm/page.h>

-	.section .text.gate,"ax"
+	.section .text.gate, "ax"

 #	define ARG0_OFF		(16 + IA64_SIGFRAME_ARG0_OFFSET)
 #	define ARG1_OFF		(16 + IA64_SIGFRAME_ARG1_OFFSET)
@@ -108,7 +108,7 @@ back_from_setup_rbs:
 	dep r8=0,r8,38,26			// clear EC0, CPL0 and reserved bits
 	adds base1=(FR6_OFF+16+SIGCONTEXT_OFF),sp
 	;;
-	.spillsp ar.pfs, CFM_OFF
+	.spillsp ar.pfs, CFM_OFF+SIGCONTEXT_OFF
 	st8 [base0]=r8				// save CFM0
 	adds base0=(FR6_OFF+SIGCONTEXT_OFF),sp
 	;;

--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -24,6 +24,7 @@
 *                              /proc/irq/#/smp_affinity
 * 02/04/02	P. Diefenbaugh	Cleaned up ACPI PCI IRQ routing.
 * 02/04/18	J.I. Lee	bug fix in iosapic_init_pci_irq
+ * 02/04/30	J.I. Lee	bug fix in find_iosapic to fix ACPI PCI IRQ to IOSAPIC mapping error
 */
 /*
 * Here is what the interrupt logic between a PCI device and the CPU looks like:
@@ -112,7 +113,7 @@ find_iosapic (unsigned int irq)
 	int i;

 	for (i = 0; i < num_iosapic; i++) {
-		if ((irq - iosapic_lists[i].base_irq) < iosapic_lists[i].max_pin)
+		if ((unsigned) (irq - iosapic_lists[i].base_irq) <= iosapic_lists[i].max_pin)
 			return i;
 	}

@@ -138,7 +139,7 @@ iosapic_irq_to_vector (int irq)
 * Map PCI pin to the corresponding IA-64 interrupt vector.  If no such mapping exists,
 * return -1.
 */
-static int
+int
 pci_pin_to_vector (int bus, int slot, int pci_pin)
 {
 	struct pci_vector_struct *r;

--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -1197,7 +1197,7 @@ static void register_irq_proc (unsigned int irq)
 {
 	char name [MAX_NAMELEN];

-	if (!root_irq_dir || (irq_desc(irq)->handler == &no_irq_type))
+	if (!root_irq_dir || (irq_desc(irq)->handler == &no_irq_type) || irq_dir[irq])
 		return;

 	memset(name, 0, MAX_NAMELEN);

--- a/arch/ia64/kernel/pal.S
+++ b/arch/ia64/kernel/pal.S
@@ -216,7 +216,7 @@ GLOBAL_ENTRY(ia64_pal_call_phys_stacked)
 	mov out3 = in3		// copy arg3
 	;;
 	mov loc3 = psr		// save psr
-	;; 
+	;;
 	mov loc4=ar.rsc			// save RSE configuration
 	dep.z loc2=loc2,0,61		// convert pal entry point to physical
 	;;

--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -23,6 +23,7 @@
 #include <linux/vmalloc.h>
 #include <linux/wrapper.h>
 #include <linux/mm.h>
+#include <linux/sysctl.h>

 #include <asm/bitops.h>
 #include <asm/errno.h>
@@ -38,11 +39,11 @@
 #ifdef CONFIG_PERFMON

 /*
- * For PMUs which rely on the debug registers for some features, you
- * must enable the following flag to activate the support for
+ * For PMUs which rely on the debug registers for some features, you must
+ * enable the following flag to activate the support for
 * accessing the registers via the perfmonctl() interface.
 */
-#ifdef CONFIG_ITANIUM
+#if defined(CONFIG_ITANIUM) || defined(CONFIG_MCKINLEY)
 #define PFM_PMU_USES_DBR	1
 #endif

@@ -68,26 +69,27 @@
 #define PMC_OVFL_NOTIFY(ctx, i)	((ctx)->ctx_soft_pmds[i].flags &  PFM_REGFL_OVFL_NOTIFY)
 #define PFM_FL_INHERIT_MASK	(PFM_FL_INHERIT_NONE|PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)

+/* i assume unsigned */
 #define PMC_IS_IMPL(i)	  (i<pmu_conf.num_pmcs && pmu_conf.impl_regs[i>>6] & (1UL<< (i) %64))
 #define PMD_IS_IMPL(i)	  (i<pmu_conf.num_pmds &&  pmu_conf.impl_regs[4+(i>>6)] & (1UL<<(i) % 64))

-#define PMD_IS_COUNTING(i) (i >=0  && i < 256 && pmu_conf.counter_pmds[i>>6] & (1UL <<(i) % 64))
-#define PMC_IS_COUNTING(i) PMD_IS_COUNTING(i)
+/* XXX: these three assume that the register index passed as i is implemented */
+#define PMD_IS_COUNTING(i) (pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING)
+#define PMC_IS_COUNTING(i) (pmu_conf.pmc_desc[i].type == PFM_REG_COUNTING)
+#define PMC_IS_MONITOR(i)  (pmu_conf.pmc_desc[i].type == PFM_REG_MONITOR)

+/* k assume unsigned */
 #define IBR_IS_IMPL(k)	  (k<pmu_conf.num_ibrs)
 #define DBR_IS_IMPL(k)	  (k<pmu_conf.num_dbrs)

-#define PMC_IS_BTB(a)	  (((pfm_monitor_t *)(a))->pmc_es == PMU_BTB_EVENT)
-
-#define LSHIFT(x)		(1UL<<(x))
-#define PMM(x)			LSHIFT(x)
-#define PMC_IS_MONITOR(c)	((pmu_conf.monitor_pmcs[0] & PMM((c))) != 0)
-
 #define CTX_IS_ENABLED(c) 	((c)->ctx_flags.state == PFM_CTX_ENABLED)
 #define CTX_OVFL_NOBLOCK(c)	((c)->ctx_fl_block == 0)
 #define CTX_INHERIT_MODE(c)	((c)->ctx_fl_inherit)
 #define CTX_HAS_SMPL(c)		((c)->ctx_psb != NULL)
-#define CTX_USED_PMD(ctx,n) 	(ctx)->ctx_used_pmds[(n)>>6] |= 1UL<< ((n) % 64)
+/* XXX: does not support more than 64 PMDs */
+#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask)
+#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL)
+

 #define CTX_USED_IBR(ctx,n) 	(ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64)
 #define CTX_USED_DBR(ctx,n) 	(ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64)
@@ -109,12 +111,18 @@
 */
 #define DBprintk(a) \
 	do { \
-		if (pfm_debug_mode >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
+		if (pfm_sysctl.debug >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
 	} while (0)

+#define DBprintk_ovfl(a) \
+	do { \
+		if (pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \
+	} while (0)
+
+

 /* 
- * These are some helpful architected PMC and IBR/DBR register layouts
+ * Architected PMC structure
 */
 typedef struct {
 	unsigned long pmc_plm:4;	/* privilege level mask */
@@ -139,41 +147,40 @@ typedef struct {
 typedef struct _pfm_smpl_buffer_desc {
 	spinlock_t		psb_lock;	/* protection lock */
 	unsigned long		psb_refcnt;	/* how many users for the buffer */
-	int			psb_flags;	/* bitvector of flags */
+	int			psb_flags;	/* bitvector of flags (not yet used) */

 	void			*psb_addr;	/* points to location of first entry */
 	unsigned long		psb_entries;	/* maximum number of entries */
 	unsigned long		psb_size;	/* aligned size of buffer */
 	unsigned long		psb_index;	/* next free entry slot XXX: must use the one in buffer */
 	unsigned long		psb_entry_size;	/* size of each entry including entry header */
+
 	perfmon_smpl_hdr_t	*psb_hdr;	/* points to sampling buffer header */

 	struct _pfm_smpl_buffer_desc *psb_next;	/* next psb, used for rvfreeing of psb_hdr */

 } pfm_smpl_buffer_desc_t;

+/*
+ * psb_flags
+ */
+#define PSB_HAS_VMA	0x1		/* a virtual mapping for the buffer exists */
+
 #define LOCK_PSB(p)	spin_lock(&(p)->psb_lock)
 #define UNLOCK_PSB(p)	spin_unlock(&(p)->psb_lock)

-#define PFM_PSB_VMA	0x1			/* a VMA is describing the buffer */
-
 /*
- * This structure is initialized at boot time and contains
- * a description of the PMU main characteristic as indicated
- * by PAL
+ * The possible type of a PMU register
 */
-typedef struct {
-	unsigned long pfm_is_disabled;	/* indicates if perfmon is working properly */
-	unsigned long perf_ovfl_val;	/* overflow value for generic counters   */
-	unsigned long max_counters;	/* upper limit on counter pair (PMC/PMD) */
-	unsigned long num_pmcs ;	/* highest PMC implemented (may have holes) */
-	unsigned long num_pmds;		/* highest PMD implemented (may have holes) */
-	unsigned long impl_regs[16];	/* buffer used to hold implememted PMC/PMD mask */
-	unsigned long num_ibrs;		/* number of instruction debug registers */
-	unsigned long num_dbrs;		/* number of data debug registers */
-	unsigned long monitor_pmcs[4];	/* which pmc are controlling monitors */
-	unsigned long counter_pmds[4];	/* which pmd are used as counters */
-} pmu_config_t;
+typedef enum { 
+	PFM_REG_NOTIMPL, /* not implemented */
+	PFM_REG_NONE, 	 /* end marker */
+	PFM_REG_MONITOR, /* a PMC with a pmc.pm field only */
+	PFM_REG_COUNTING,/* a PMC with a pmc.pm AND pmc.oi, a PMD used as a counter */
+	PFM_REG_CONTROL, /* PMU control register */
+	PFM_REG_CONFIG,  /* refine configuration */
+	PFM_REG_BUFFER	 /* PMD used as buffer */
+} pfm_pmu_reg_type_t;

 /*
 * 64-bit software counter structure
@@ -221,9 +228,11 @@ typedef struct pfm_context {

 	struct semaphore	ctx_restart_sem;   	/* use for blocking notification mode */

-	unsigned long		ctx_used_pmds[4];	/* bitmask of used PMD (speedup ctxsw) */
-	unsigned long		ctx_saved_pmcs[4];	/* bitmask of PMC to save on ctxsw */
-	unsigned long		ctx_reload_pmcs[4];	/* bitmask of PMC to reload on ctxsw (SMP) */
+	unsigned long		ctx_used_pmds[4];	/* bitmask of PMD used                 */
+	unsigned long		ctx_reload_pmds[4];	/* bitmask of PMD to reload on ctxsw   */
+
+	unsigned long		ctx_used_pmcs[4];	/* bitmask PMC used by context         */
+	unsigned long		ctx_reload_pmcs[4];	/* bitmask of PMC to reload on ctxsw   */

 	unsigned long		ctx_used_ibrs[4];	/* bitmask of used IBR (speedup ctxsw) */
 	unsigned long		ctx_used_dbrs[4];	/* bitmask of used DBR (speedup ctxsw) */
@@ -235,6 +244,7 @@ typedef struct pfm_context {
 	unsigned long		ctx_cpu;		/* cpu to which perfmon is applied (system wide) */

 	atomic_t		ctx_saving_in_progress;	/* flag indicating actual save in progress */
+	atomic_t		ctx_is_busy;		/* context accessed by overflow handler */
 	atomic_t		ctx_last_cpu;		/* CPU id of current or last CPU used */
 } pfm_context_t;

@@ -250,15 +260,53 @@ typedef struct pfm_context {
 * mostly used to synchronize between system wide and per-process
 */
 typedef struct {
-	spinlock_t		pfs_lock;		/* lock the structure */
+	spinlock_t		pfs_lock;		   /* lock the structure */

-	unsigned long		pfs_task_sessions;	/* number of per task sessions */
-	unsigned long		pfs_sys_sessions;	/* number of per system wide sessions */
-	unsigned long   	pfs_sys_use_dbregs;	  	/* incremented when a system wide session uses debug regs */
-	unsigned long   	pfs_ptrace_use_dbregs;	  /* incremented when a process uses debug regs */
-	struct task_struct	*pfs_sys_session[NR_CPUS];  /* point to task owning a system-wide session */
+	unsigned long		pfs_task_sessions;	   /* number of per task sessions */
+	unsigned long		pfs_sys_sessions;	   /* number of per system wide sessions */
+	unsigned long   	pfs_sys_use_dbregs;	   /* incremented when a system wide session uses debug regs */
+	unsigned long   	pfs_ptrace_use_dbregs;	   /* incremented when a process uses debug regs */
+	struct task_struct	*pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */
 } pfm_session_t;

+/*
+ * information about a PMC or PMD.
+ * dep_pmd[]: a bitmask of dependent PMD registers 
+ * dep_pmc[]: a bitmask of dependent PMC registers
+ */
+typedef struct {
+	pfm_pmu_reg_type_t	type;
+	int			pm_pos;
+	int			(*read_check)(struct task_struct *task, unsigned int cnum, unsigned long *val);
+	int			(*write_check)(struct task_struct *task, unsigned int cnum, unsigned long *val);
+	unsigned long		dep_pmd[4];
+	unsigned long		dep_pmc[4];
+} pfm_reg_desc_t;
+/* assume cnum is a valid monitor */
+#define PMC_PM(cnum, val)	(((val) >> (pmu_conf.pmc_desc[cnum].pm_pos)) & 0x1)
+#define PMC_WR_FUNC(cnum)	(pmu_conf.pmc_desc[cnum].write_check)
+#define PMD_WR_FUNC(cnum)	(pmu_conf.pmd_desc[cnum].write_check)
+#define PMD_RD_FUNC(cnum)	(pmu_conf.pmd_desc[cnum].read_check)
+
+/*
+ * This structure is initialized at boot time and contains
+ * a description of the PMU main characteristic as indicated
+ * by PAL along with a list of inter-registers dependencies and configurations.
+ */
+typedef struct {
+	unsigned long pfm_is_disabled;	/* indicates if perfmon is working properly */
+	unsigned long perf_ovfl_val;	/* overflow value for generic counters   */
+	unsigned long max_counters;	/* upper limit on counter pair (PMC/PMD) */
+	unsigned long num_pmcs ;	/* highest PMC implemented (may have holes) */
+	unsigned long num_pmds;		/* highest PMD implemented (may have holes) */
+	unsigned long impl_regs[16];	/* buffer used to hold implemented PMC/PMD mask */
+	unsigned long num_ibrs;		/* number of instruction debug registers */
+	unsigned long num_dbrs;		/* number of data debug registers */
+	pfm_reg_desc_t *pmc_desc;	/* detailed PMC register descriptions */
+	pfm_reg_desc_t *pmd_desc;	/* detailed PMD register descriptions */
+} pmu_config_t;
+
+
 /*
 * structure used to pass argument to/from remote CPU 
 * using IPI to check and possibly save the PMU context on SMP systems.
@@ -301,22 +349,52 @@ typedef struct {
 #define PFM_CMD_NARG(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_narg)
 #define PFM_CMD_ARG_SIZE(cmd)	(pfm_cmd_tab[PFM_CMD_IDX(cmd)].cmd_argsize)

+typedef struct {	/* tunables exported through pfm_ctl_table[] */
+	int	debug;		/* turn on/off debugging via syslog */
+	int	debug_ovfl;	/* turn on/off debug printk in overflow handler */
+	int	fastctxsw;	/* turn on/off fast (insecure) ctxsw */
+} pfm_sysctl_t;
+
+typedef struct {	/* perfmon event counters, instantiated as pfm_stats */
+	unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
+	unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */
+	unsigned long pfm_recorded_samples_count; /* presumably the number of samples recorded -- TODO confirm where it is updated */
+	unsigned long pfm_full_smpl_buffer_count; /* how many times the sampling buffer was full */
+} pfm_stats_t;

 /*
 * perfmon internal variables
 */
 static pmu_config_t	pmu_conf; 	/* PMU configuration */
-static int		pfm_debug_mode;	/* 0= nodebug, >0= debug output on */
 static pfm_session_t	pfm_sessions;	/* global sessions information */
 static struct proc_dir_entry *perfmon_dir; /* for debug only */
-static unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
-static unsigned long pfm_ovfl_intr_count; /* keep track of spurious ovfl interrupts */
-static unsigned long pfm_recorded_samples_count;
-
+static pfm_stats_t	pfm_stats;
+int __per_cpu_data pfm_syst_wide;
+static int __per_cpu_data pfm_dcr_pp;
+
+/* sysctl() controls: pfm_sysctl is the backing storage for the entries of pfm_ctl_table[] */
+static pfm_sysctl_t pfm_sysctl;
+
+static ctl_table pfm_ctl_table[]={	/* integer knobs, all handled by proc_dointvec */
+	{1, "debug", &pfm_sysctl.debug, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},	/* NOTE(review): mode 0666 is writable by any user -- confirm intended */
+	{2, "debug_ovfl", &pfm_sysctl.debug_ovfl, sizeof(int), 0666, NULL, &proc_dointvec, NULL,},	/* NOTE(review): mode 0666 here as well -- confirm intended */
+	{3, "fastctxsw", &pfm_sysctl.fastctxsw, sizeof(int), 0600, NULL, &proc_dointvec, NULL,},	/* mode 0600: owner-only access */
+	{ 0, },	/* zeroed entry ends the table */
+};
+static ctl_table pfm_sysctl_dir[] = {	/* "perfmon" entry referencing pfm_ctl_table */
+	{1, "perfmon", NULL, 0, 0755, pfm_ctl_table, },
+ 	{0,},
+};
+static ctl_table pfm_sysctl_root[] = {	/* "kernel" entry referencing pfm_sysctl_dir; presumably yields kernel/perfmon/<knob> -- confirm */
+	{1, "kernel", NULL, 0, 0755, pfm_sysctl_dir, },
+ 	{0,},
+};
+static struct ctl_table_header *pfm_sysctl_header;	/* presumably set when pfm_sysctl_root is registered -- registration not visible here */

 static unsigned long reset_pmcs[IA64_NUM_PMC_REGS];	/* contains PAL reset values for PMCS */

 static void pfm_vm_close(struct vm_area_struct * area);
+
 static struct vm_operations_struct pfm_vm_ops={
 	close: pfm_vm_close
 };
@@ -339,6 +417,14 @@ static void pfm_fetch_regs(int cpu, struct task_struct *task, pfm_context_t *ctx
 #endif
 static void pfm_lazy_save_regs (struct task_struct *ta);

+#if   defined(CONFIG_ITANIUM)
+#include "perfmon_itanium.h"
+#elif defined(CONFIG_MCKINLEY)
+#include "perfmon_mckinley.h"
+#else
+#include "perfmon_generic.h"
+#endif
+
 static inline unsigned long
 pfm_read_soft_counter(pfm_context_t *ctx, int i)
 {
@@ -353,7 +439,7 @@ pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val)
 	 * writing to unimplemented part is ignore, so we do not need to
 	 * mask off top part
 	 */
-	ia64_set_pmd(i, val);
+	ia64_set_pmd(i, val & pmu_conf.perf_ovfl_val);
 }

 /*
@@ -388,7 +474,8 @@ pfm_get_stamp(void)
 }

 /* Here we want the physical address of the memory.
- * This is used when initializing the contents of the area.
+ * This is used when initializing the contents of the
+ * area and marking the pages as reserved.
 */
 static inline unsigned long
 pfm_kvirt_to_pa(unsigned long adr)
@@ -398,7 +485,6 @@ pfm_kvirt_to_pa(unsigned long adr)
 	return pa;
 }

-
 static void *
 pfm_rvmalloc(unsigned long size)
 {
@@ -473,7 +559,7 @@ pfm_vm_close(struct vm_area_struct *vma)
 	 *
 	 * This function cannot remove the buffer from here, because exit_mmap() must first
 	 * complete. Given that there is no other vma related callback in the generic code,
-	 * we have created on own with the linked list of sampling buffer to free which
+	 * we have created our own with the linked list of sampling buffers to free. The list
 	 * is part of the thread structure. In release_thread() we check if the list is
 	 * empty. If not we call into perfmon to free the buffer and psb. That is the only
 	 * way to ensure a safe deallocation of the sampling buffer which works when
@@ -489,16 +575,15 @@ pfm_vm_close(struct vm_area_struct *vma)
 		psb->psb_next = current->thread.pfm_smpl_buf_list;
 		current->thread.pfm_smpl_buf_list = psb;

-		DBprintk(("psb for [%d] smpl @%p size %ld inserted into list\n", 
-			current->pid, psb->psb_hdr, psb->psb_size));
+		DBprintk(("[%d] add smpl @%p size %lu to smpl_buf_list psb_flags=0x%x\n", 
+			current->pid, psb->psb_hdr, psb->psb_size, psb->psb_flags));
 	}
-	DBprintk(("psb vma flag cleared for [%d] smpl @%p size %ld inserted into list\n", 
-			current->pid, psb->psb_hdr, psb->psb_size));
-
+	DBprintk(("[%d] clearing psb_flags=0x%x smpl @%p size %lu\n", 
+			current->pid, psb->psb_flags, psb->psb_hdr, psb->psb_size));
 	/*
-	 * indicate to pfm_context_exit() that the vma has been removed. 
+	 * record that no vma describes the buffer anymore
 	 */
-	psb->psb_flags &= ~PFM_PSB_VMA;
+	psb->psb_flags &= ~PSB_HAS_VMA;

 	UNLOCK_PSB(psb);
 }
@@ -521,7 +606,7 @@ pfm_remove_smpl_mapping(struct task_struct *task)
 		printk("perfmon: invalid context mm=%p\n", task->mm);
 		return -1;
 	}
-	psb = ctx->ctx_psb;	
+	psb = ctx->ctx_psb;

 	down_write(&task->mm->mmap_sem);

@@ -532,14 +617,9 @@ pfm_remove_smpl_mapping(struct task_struct *task)
 		printk("perfmon: pid %d unable to unmap sampling buffer @0x%lx size=%ld\n", 
 				task->pid, ctx->ctx_smpl_vaddr, psb->psb_size);
 	}
-	DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d\n", 
-		task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r));

-	/* 
-	 * make sure we suppress all traces of this buffer
-	 * (important for pfm_inherit)
-	 */
-	ctx->ctx_smpl_vaddr = 0;
+	DBprintk(("[%d] do_unmap(0x%lx, %ld)=%d refcnt=%lu psb_flags=0x%x\n", 
+		task->pid, ctx->ctx_smpl_vaddr, psb->psb_size, r, psb->psb_refcnt, psb->psb_flags));

 	return 0;
 }
@@ -572,7 +652,7 @@ pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long ad
 	while (size > 0) {
 		page = pfm_kvirt_to_pa(buf);

-		if (remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_SHARED)) return -ENOMEM;
+		if (remap_page_range(vma, addr, page, PAGE_SIZE, PAGE_READONLY)) return -ENOMEM;
 		
 		addr  += PAGE_SIZE;
 		buf   += PAGE_SIZE;
@@ -611,17 +691,25 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo
 	void *smpl_buf;
 	pfm_smpl_buffer_desc_t *psb;

-	regcount = pfm_smpl_entry_size(which_pmds, 1);

 	/* note that regcount might be 0, in this case only the header for each
 	 * entry will be recorded.
 	 */
+	regcount = pfm_smpl_entry_size(which_pmds, 1);
+
+	if ((sizeof(perfmon_smpl_hdr_t)+ entries*sizeof(perfmon_smpl_entry_t)) <= entries) {
+		DBprintk(("requested entries %lu is too big\n", entries));
+		return -EINVAL;
+	}

 	/*
 	 * 1 buffer hdr and for each entry a header + regcount PMDs to save
 	 */
 	size = PAGE_ALIGN(  sizeof(perfmon_smpl_hdr_t)
 			  + entries * (sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64)));
+
+	DBprintk(("sampling buffer size=%lu bytes\n", size));
+
 	/*
 	 * check requested size to avoid Denial-of-service attacks
 	 * XXX: may have to refine this test	
@@ -661,8 +749,13 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo
 	}
 	/*
 	 * partially initialize the vma for the sampling buffer
+	 *
+	 * The VM_DONTCOPY flag is very important as it ensures that the mapping
+	 * will never be inherited for any child process (via fork()) which is always 
+	 * what we want.
 	 */
-	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;
+	vma->vm_mm	     = mm;
+	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED|VM_DONTCOPY;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 	vma->vm_ops	     = &pfm_vm_ops; /* necesarry to get the close() callback */
 	vma->vm_pgoff	     = 0;
@@ -680,8 +773,8 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo
 	psb->psb_size    = size; /* aligned size */
 	psb->psb_index   = 0;
 	psb->psb_entries = entries;
-	psb->psb_flags   = PFM_PSB_VMA; /* remember that there is a vma describing the buffer */
 	psb->psb_refcnt  = 1;
+	psb->psb_flags   = PSB_HAS_VMA;

 	spin_lock_init(&psb->psb_lock);

@@ -691,9 +784,9 @@ pfm_smpl_buffer_alloc(pfm_context_t *ctx, unsigned long *which_pmds, unsigned lo
 	 */
 	psb->psb_entry_size = sizeof(perfmon_smpl_entry_t) + regcount*sizeof(u64);

-	DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p\n", 
+	DBprintk(("psb @%p entry_size=%ld hdr=%p addr=%p refcnt=%lu psb_flags=0x%x\n", 
 		  (void *)psb,psb->psb_entry_size, (void *)psb->psb_hdr, 
-		  (void *)psb->psb_addr));
+		  (void *)psb->psb_addr, psb->psb_refcnt, psb->psb_flags));

 	/* initialize some of the fields of user visible buffer header */
 	psb->psb_hdr->hdr_version    = PFM_SMPL_VERSION;
@@ -785,6 +878,11 @@ pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
 	}
 	ctx_flags = pfx->ctx_flags;

+	if ((ctx_flags & PFM_FL_INHERIT_MASK) == (PFM_FL_INHERIT_ONCE|PFM_FL_INHERIT_ALL)) {
+		DBprintk(("invalid inherit mask 0x%x\n",ctx_flags & PFM_FL_INHERIT_MASK));
+		return -EINVAL;
+	}
+
 	if (ctx_flags & PFM_FL_SYSTEM_WIDE) {
 		DBprintk(("cpu_mask=0x%lx\n", pfx->ctx_cpu_mask));
 		/*
@@ -823,7 +921,15 @@ pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
 		 * must provide a target for the signal in blocking mode even when
 		 * no counter is configured with PFM_FL_REG_OVFL_NOTIFY
 		 */
-		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) return -EINVAL;
+		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == 0) {
+			DBprintk(("must have notify_pid when blocking for [%d]\n", task->pid));
+			return -EINVAL;
+		}
+
+		if ((ctx_flags & PFM_FL_NOTIFY_BLOCK) && pfx->ctx_notify_pid == task->pid) {
+			DBprintk(("cannot notify self when blocking for [%d]\n", task->pid));
+			return -EINVAL;
+		}
 	}
 	/* probably more to add here */

@@ -831,7 +937,7 @@ pfx_is_sane(struct task_struct *task, pfarg_context_t *pfx)
 }

 static int
-pfm_create_context(struct task_struct *task, pfm_context_t *ctx, void *req, int count, 
+pfm_context_create(struct task_struct *task, pfm_context_t *ctx, void *req, int count, 
 		   struct pt_regs *regs)
 {
 	pfarg_context_t tmp;
@@ -956,7 +1062,7 @@ pfm_create_context(struct task_struct *task, pfm_context_t *ctx, void *req, int
 	}

 	if (tmp.ctx_smpl_entries) {
-		DBprintk(("sampling entries=%ld\n",tmp.ctx_smpl_entries));
+		DBprintk(("sampling entries=%lu\n",tmp.ctx_smpl_entries));

 		ret = pfm_smpl_buffer_alloc(ctx, tmp.ctx_smpl_regs, 
 						 tmp.ctx_smpl_entries, &uaddr);
@@ -982,20 +1088,12 @@ pfm_create_context(struct task_struct *task, pfm_context_t *ctx, void *req, int

 	atomic_set(&ctx->ctx_last_cpu,-1); /* SMP only, means no CPU */

-	/* 
-	 * Keep track of the pmds we want to sample
-	 * XXX: may be we don't need to save/restore the DEAR/IEAR pmds
-	 * but we do need the BTB for sure. This is because of a hardware
-	 * buffer of 1 only for non-BTB pmds.
-	 *
-	 * We ignore the unimplemented pmds specified by the user
-	 */
-	ctx->ctx_used_pmds[0]  = tmp.ctx_smpl_regs[0] & pmu_conf.impl_regs[4];
-	ctx->ctx_saved_pmcs[0] = 1; /* always save/restore PMC[0] */
+	/* may be redundant with memset() but at least it's easier to remember */
+	atomic_set(&ctx->ctx_saving_in_progress, 0); 
+	atomic_set(&ctx->ctx_is_busy, 0); 

 	sema_init(&ctx->ctx_restart_sem, 0); /* init this semaphore to locked */

-
 	if (copy_to_user(req, &tmp, sizeof(tmp))) {
 		ret = -EFAULT;
 		goto buffer_error;
@@ -1097,21 +1195,22 @@ pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int flag)
 			  	current->pid, 
 				flag == PFM_RELOAD_LONG_RESET ? "long" : "short", i, val));
 	}
+	ia64_srlz_d();
 	/* just in case ! */
 	ctx->ctx_ovfl_regs[0] = 0UL;
 }

 static int
-pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
+pfm_write_pmcs(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *th = &ta->thread;
+	struct thread_struct *th = &task->thread;
 	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
 	unsigned int cnum;
 	int i;
 	int ret = 0, reg_retval = 0;

 	/* we don't quite support this right now */
-	if (ta != current) return -EINVAL;
+	if (task != current) return -EINVAL;

 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

@@ -1140,30 +1239,30 @@ pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 		 * 	- per-task : user monitor
 		 * any other configuration is rejected.
 		 */
-		if (PMC_IS_MONITOR(cnum)) {
-			pfm_monitor_t *p = (pfm_monitor_t *)&tmp.reg_value;
+		if (PMC_IS_MONITOR(cnum) || PMC_IS_COUNTING(cnum)) {
+			DBprintk(("pmc[%u].pm=%ld\n", cnum, PMC_PM(cnum, tmp.reg_value))); 

-			DBprintk(("pmc[%u].pm = %d\n", cnum, p->pmc_pm));
-
-			if (ctx->ctx_fl_system ^ p->pmc_pm) {
-			//if ((ctx->ctx_fl_system == 1 && p->pmc_pm == 0)
-			 //  ||(ctx->ctx_fl_system == 0 && p->pmc_pm == 1)) {
+			if (ctx->ctx_fl_system ^ PMC_PM(cnum, tmp.reg_value)) {
+				DBprintk(("pmc_pm=%ld fl_system=%d\n", PMC_PM(cnum, tmp.reg_value), ctx->ctx_fl_system));
 				ret = -EINVAL;
 				goto abort_mission;
 			}
-			/*
-			 * enforce generation of overflow interrupt. Necessary on all
-			 * CPUs which do not implement 64-bit hardware counters.
-			 */
-			p->pmc_oi = 1;
 		}

 		if (PMC_IS_COUNTING(cnum)) {
+			pfm_monitor_t *p = (pfm_monitor_t *)&tmp.reg_value;
+			/*
+		 	 * enforce generation of overflow interrupt. Necessary on all
+		 	 * CPUs.
+		 	 */
+			p->pmc_oi = 1;
+
 			if (tmp.reg_flags & PFM_REGFL_OVFL_NOTIFY) {
 				/*
 				 * must have a target for the signal
 				 */
 				if (ctx->ctx_notify_task == NULL) {
+					DBprintk(("no notify_task && PFM_REGFL_OVFL_NOTIFY\n"));
 					ret = -EINVAL;
 					goto abort_mission;
 				}
@@ -1177,14 +1276,11 @@ pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 			ctx->ctx_soft_pmds[cnum].reset_pmds[1] = tmp.reg_reset_pmds[1];
 			ctx->ctx_soft_pmds[cnum].reset_pmds[2] = tmp.reg_reset_pmds[2];
 			ctx->ctx_soft_pmds[cnum].reset_pmds[3] = tmp.reg_reset_pmds[3];
-
-			/*
-			 * needed in case the user does not initialize the equivalent
-			 * PMD. Clearing is done in reset_pmu() so there is no possible
-			 * leak here.
-			 */
-			CTX_USED_PMD(ctx, cnum);
 		}
+		/*
+		 * execute write checker, if any
+		 */
+		if (PMC_WR_FUNC(cnum)) ret = PMC_WR_FUNC(cnum)(task, cnum, &tmp.reg_value);
 abort_mission:
 		if (ret == -EINVAL) reg_retval = PFM_REG_RETFL_EINVAL;

@@ -1204,14 +1300,21 @@ pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 		 */
 		if (ret != 0) {
 			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
-				  ta->pid, cnum, tmp.reg_value, reg_retval));
+				  task->pid, cnum, tmp.reg_value, reg_retval));
 			break;
 		}

 		/* 
 		 * We can proceed with this register!
 		 */
-		
+
+		/*
+		 * Needed in case the user does not initialize the equivalent
+		 * PMD. Clearing is done in reset_pmu() so there is no possible
+		 * leak here.
+		 */
+		CTX_USED_PMD(ctx, pmu_conf.pmc_desc[cnum].dep_pmd[0]);
+
 		/* 
 		 * keep copy the pmc, used for register reload
 		 */
@@ -1219,17 +1322,17 @@ pfm_write_pmcs(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,

 		ia64_set_pmc(cnum, tmp.reg_value);

-		DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x save_pmcs=0%lx reload_pmcs=0x%lx\n", 
-			  ta->pid, cnum, tmp.reg_value, 
+		DBprintk(("[%d] pmc[%u]=0x%lx flags=0x%x used_pmds=0x%lx\n", 
+			  task->pid, cnum, tmp.reg_value, 
 			  ctx->ctx_soft_pmds[cnum].flags, 
-			  ctx->ctx_saved_pmcs[0], ctx->ctx_reload_pmcs[0]));
+			  ctx->ctx_used_pmds[0]));

 	}
 	return ret;
 }

 static int
-pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
+pfm_write_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
 	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
 	unsigned int cnum;
@@ -1237,7 +1340,7 @@ pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 	int ret = 0, reg_retval = 0;

 	/* we don't quite support this right now */
-	if (ta != current) return -EINVAL;
+	if (task != current) return -EINVAL;

 	/* 
 	 * Cannot do anything before PMU is enabled 
@@ -1252,7 +1355,6 @@ pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

 		cnum = tmp.reg_num;
-
 		if (!PMD_IS_IMPL(cnum)) {
 			ret = -EINVAL;
 			goto abort_mission;
@@ -1266,6 +1368,10 @@ pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 			ctx->ctx_soft_pmds[cnum].short_reset = tmp.reg_short_reset;

 		}
+		/*
+		 * execute write checker, if any
+		 */
+		if (PMD_WR_FUNC(cnum)) ret = PMD_WR_FUNC(cnum)(task, cnum, &tmp.reg_value);
 abort_mission:
 		if (ret == -EINVAL) reg_retval = PFM_REG_RETFL_EINVAL;

@@ -1282,21 +1388,22 @@ pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 		 */
 		if (ret != 0) {
 			DBprintk(("[%d] pmc[%u]=0x%lx error %d\n",
-				  ta->pid, cnum, tmp.reg_value, reg_retval));
+				  task->pid, cnum, tmp.reg_value, reg_retval));
 			break;
 		}

 		/* keep track of what we use */
-		CTX_USED_PMD(ctx, cnum);
+		CTX_USED_PMD(ctx, pmu_conf.pmd_desc[(cnum)].dep_pmd[0]);

 		/* writes to unimplemented part is ignored, so this is safe */
-		ia64_set_pmd(cnum, tmp.reg_value);
+		ia64_set_pmd(cnum, tmp.reg_value & pmu_conf.perf_ovfl_val);

 		/* to go away */
 		ia64_srlz_d();
+
 		DBprintk(("[%d] pmd[%u]: soft_pmd=0x%lx  short_reset=0x%lx "
 			  "long_reset=0x%lx hw_pmd=%lx notify=%c used_pmds=0x%lx reset_pmds=0x%lx\n",
-				ta->pid, cnum,
+				task->pid, cnum,
 				ctx->ctx_soft_pmds[cnum].val,
 				ctx->ctx_soft_pmds[cnum].short_reset,
 				ctx->ctx_soft_pmds[cnum].long_reset,
@@ -1309,12 +1416,13 @@ pfm_write_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 }

 static int
-pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
+pfm_read_pmds(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs)
 {
-	struct thread_struct *th = &ta->thread;
+	struct thread_struct *th = &task->thread;
 	unsigned long val=0;
 	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;
-	int i;
+	unsigned int cnum;
+	int i, ret = 0;

 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

@@ -1327,14 +1435,25 @@ pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,

 	/* XXX: ctx locking may be required here */

-	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), ta->pid));
+	DBprintk(("ctx_last_cpu=%d for [%d]\n", atomic_read(&ctx->ctx_last_cpu), task->pid));

 	for (i = 0; i < count; i++, req++) {
 		unsigned long reg_val = ~0UL, ctx_val = ~0UL;

 		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;

-		if (!PMD_IS_IMPL(tmp.reg_num)) goto abort_mission;
+		cnum = tmp.reg_num;
+
+		if (!PMD_IS_IMPL(cnum)) goto abort_mission;
+		/*
+		 * we can only read the registers that we use. That includes
+		 * the ones we explicitly initialize AND the ones we want included
+		 * in the sampling buffer (smpl_regs).
+		 *
+		 * Having this restriction allows optimization in the ctxsw routine
+		 * without compromising security (leaks)
+		 */
+		if (!CTX_IS_USED_PMD(ctx, cnum)) goto abort_mission;

 		/*
 		 * If the task is not the current one, then we check if the
@@ -1343,8 +1462,8 @@ pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 		 */
 		if (atomic_read(&ctx->ctx_last_cpu) == smp_processor_id()){
 			ia64_srlz_d();
-			val = reg_val = ia64_get_pmd(tmp.reg_num);
-			DBprintk(("reading pmd[%u]=0x%lx from hw\n", tmp.reg_num, val));
+			val = reg_val = ia64_get_pmd(cnum);
+			DBprintk(("reading pmd[%u]=0x%lx from hw\n", cnum, val));
 		} else {
 #ifdef CONFIG_SMP
 			int cpu;
@@ -1360,30 +1479,38 @@ pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 			 */
 			cpu = atomic_read(&ctx->ctx_last_cpu);
 			if (cpu != -1) {
-				DBprintk(("must fetch on CPU%d for [%d]\n", cpu, ta->pid));
-				pfm_fetch_regs(cpu, ta, ctx);
+				DBprintk(("must fetch on CPU%d for [%d]\n", cpu, task->pid));
+				pfm_fetch_regs(cpu, task, ctx);
 			}
 #endif
 			/* context has been saved */
-			val = reg_val = th->pmd[tmp.reg_num];
+			val = reg_val = th->pmd[cnum];
 		}
-		if (PMD_IS_COUNTING(tmp.reg_num)) {
+		if (PMD_IS_COUNTING(cnum)) {
 			/*
 			 * XXX: need to check for overflow
 			 */

 			val &= pmu_conf.perf_ovfl_val;
-			val += ctx_val = ctx->ctx_soft_pmds[tmp.reg_num].val;
+			val += ctx_val = ctx->ctx_soft_pmds[cnum].val;
 		} else {
-
-			val = reg_val = ia64_get_pmd(tmp.reg_num);
+			val = reg_val = ia64_get_pmd(cnum);
 		}
-		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);
+
 		tmp.reg_value = val;

-		DBprintk(("read pmd[%u] soft_pmd=0x%lx reg=0x%lx pmc=0x%lx\n", 
-					tmp.reg_num, ctx_val, reg_val, 
-					ia64_get_pmc(tmp.reg_num)));
+		/*
+		 * execute read checker, if any
+		 */
+		if (PMD_RD_FUNC(cnum)) {
+			ret = PMD_RD_FUNC(cnum)(task, cnum, &tmp.reg_value);
+		}
+
+		PFM_REG_RETFLAG_SET(tmp.reg_flags, ret);
+
+		DBprintk(("read pmd[%u] ret=%d soft_pmd=0x%lx reg=0x%lx pmc=0x%lx\n", 
+					cnum, ret, ctx_val, reg_val, 
+					ia64_get_pmc(cnum)));

 		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
 	}
@@ -1391,7 +1518,7 @@ pfm_read_pmds(struct task_struct *ta, pfm_context_t *ctx, void *arg, int count,
 abort_mission:
 	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);
 	/* 
-	 * XXX: if this fails, we stick we the original failure, flag not updated!
+	 * XXX: if this fails, we stick with the original failure, flag not updated!
 	 */
 	copy_to_user(req, &tmp, sizeof(tmp));
 	return -EINVAL;
@@ -1459,6 +1586,7 @@ pfm_use_debug_registers(struct task_struct *task)
 * perfmormance monitoring, so we only decrement the number
 * of "ptraced" debug register users to keep the count up to date
 */
+
 int
 pfm_release_debug_registers(struct task_struct *task)
 {
@@ -1505,13 +1633,6 @@ pfm_restart(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 	 */
 	if (!CTX_IS_ENABLED(ctx)) return -EINVAL;

-#if 0
-	if (ctx->ctx_fl_frozen==0) {
-		printk("task %d without pmu_frozen set\n", task->pid);
-		return -EINVAL;
-	}
-#endif
-
 	if (task == current) {
 		DBprintk(("restarting self %d frozen=%d \n", current->pid, ctx->ctx_fl_frozen));

@@ -1554,7 +1675,6 @@ pfm_restart(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 		up(sem);
 	} else {
 		task->thread.pfm_ovfl_block_reset = 1;
-		set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
 	}
 #if 0
 	/*
@@ -1629,25 +1749,35 @@ pfm_stop(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 				current->pid,
 				ctx->ctx_fl_system, PMU_OWNER(),
 				current));
+
 	/* simply stop monitoring but not the PMU */
 	if (ctx->ctx_fl_system) {

-		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
-
 		/* disable dcr pp */
 		ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);

+		/* stop monitoring */
+		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
+
+		ia64_srlz_i();
+
 #ifdef CONFIG_SMP
-		local_cpu_data->pfm_dcr_pp  = 0;
+		this_cpu(pfm_dcr_pp)  = 0;
 #else
 		pfm_tasklist_toggle_pp(0);
 #endif
-
 		ia64_psr(regs)->pp = 0;

 	} else {
+
+		/* stop monitoring */
 		__asm__ __volatile__ ("rum psr.up;;"::: "memory");

+		ia64_srlz_i();
+
+		/*
+		 * clear user level psr.up
+		 */
 		ia64_psr(regs)->up = 0;
 	}
 	return 0;
@@ -1674,7 +1804,7 @@ pfm_disable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 		ia64_psr(regs)->up = 0;
 	}
 	/* 
-	 * goes back to default behavior 
+	 * goes back to default behavior: no user level control
 	 * no need to change live psr.sp because useless at the kernel level
 	 */
 	ia64_psr(regs)->sp = 1;
@@ -1686,10 +1816,8 @@ pfm_disable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 	return 0;
 }

-
-
 static int
-pfm_destroy_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
+pfm_context_destroy(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
 	 struct pt_regs *regs)
 {
 	/* we don't quite support this right now */
@@ -1715,15 +1843,14 @@ pfm_destroy_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int
 		ia64_psr(regs)->up = 0;
 	}

-	/* restore security level */
-	ia64_psr(regs)->sp = 1;
-
 skipped_stop:
 	/*
 	 * remove sampling buffer mapping, if any
 	 */
-	if (ctx->ctx_smpl_vaddr) pfm_remove_smpl_mapping(task);
-
+	if (ctx->ctx_smpl_vaddr) {
+		pfm_remove_smpl_mapping(task);
+		ctx->ctx_smpl_vaddr = 0UL;
+	}
 	/* now free context and related state */
 	pfm_context_exit(task);

@@ -1734,7 +1861,7 @@ pfm_destroy_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int
 * does nothing at the moment
 */
 static int
-pfm_unprotect_context(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
+pfm_context_unprotect(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
 	 struct pt_regs *regs)
 {
 	return 0;
@@ -1764,9 +1891,9 @@ pfm_debug(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 {
 	unsigned int mode = *(unsigned int *)arg;

-	pfm_debug_mode = mode == 0 ? 0 : 1;
+	pfm_sysctl.debug = mode == 0 ? 0 : 1;

-	printk("perfmon debugging %s\n", pfm_debug_mode ? "on" : "off");
+	printk("perfmon debugging %s\n", pfm_sysctl.debug ? "on" : "off");

 	return 0;
 }
@@ -1855,15 +1982,17 @@ pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count, stru
 			memset(task->thread.ibr, 0, sizeof(task->thread.ibr));

 			/*
-			 * clear hardware registers to make sure we don't leak
-			 * information and pick up stale state
+			 * clear hardware registers to make sure we don't
+			 * pick up stale state
 			 */
 			for (i=0; i < pmu_conf.num_ibrs; i++) {
 				ia64_set_ibr(i, 0UL);
 			}
+			ia64_srlz_i();
 			for (i=0; i < pmu_conf.num_dbrs; i++) {
 				ia64_set_dbr(i, 0UL);
 			}
+			ia64_srlz_d();
 		}
 	}

@@ -1924,6 +2053,7 @@ pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count, stru
 			CTX_USED_IBR(ctx, rnum);

 			ia64_set_ibr(rnum, dbreg.val);
+			ia64_srlz_i();

 			thread->ibr[rnum] = dbreg.val;

@@ -1932,6 +2062,7 @@ pfm_write_ibr_dbr(int mode, struct task_struct *task, void *arg, int count, stru
 			CTX_USED_DBR(ctx, rnum);

 			ia64_set_dbr(rnum, dbreg.val);
+			ia64_srlz_d();

 			thread->dbr[rnum] = dbreg.val;

@@ -2031,27 +2162,35 @@ pfm_start(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,

 	if (ctx->ctx_fl_system) {
 		
-		/* enable dcr pp */
-		ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
-
 #ifdef CONFIG_SMP
-		local_cpu_data->pfm_dcr_pp  = 1;
+		this_cpu(pfm_dcr_pp)  = 1;
 #else
 		pfm_tasklist_toggle_pp(1);
 #endif
+		/* set user level psr.pp */
 		ia64_psr(regs)->pp = 1;

+		/* start monitoring at kernel level */
 		__asm__ __volatile__ ("ssm psr.pp;;"::: "memory");

+		/* enable dcr pp */
+		ia64_set_dcr(ia64_get_dcr()|IA64_DCR_PP);
+
+		ia64_srlz_i();
+
 	} else {
 		if ((task->thread.flags & IA64_THREAD_PM_VALID) == 0) {
 			printk("perfmon: pfm_start task flag not set for [%d]\n", task->pid);
 			return -EINVAL;
 		}
+		/* set user level psr.up */
 		ia64_psr(regs)->up = 1;
+
+		/* start monitoring at kernel level */
 		__asm__ __volatile__ ("sum psr.up;;"::: "memory");
+
+		ia64_srlz_i();
 	}
-	ia64_srlz_d();

 	return 0;
 }
@@ -2074,11 +2213,13 @@ pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 		ia64_psr(regs)->pp = 0;
 		ia64_psr(regs)->up = 0; /* just to make sure! */

+		/* make sure monitoring is stopped */
 		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
+		ia64_srlz_i();

 #ifdef CONFIG_SMP
-		local_cpu_data->pfm_syst_wide = 1;
-		local_cpu_data->pfm_dcr_pp    = 0;
+		this_cpu(pfm_syst_wide) = 1;
+		this_cpu(pfm_dcr_pp)    = 0;
 #endif
 	} else {
 		/*
@@ -2089,21 +2230,21 @@ pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 		ia64_psr(regs)->pp = 0; /* just to make sure! */
 		ia64_psr(regs)->up = 0;

+		/* make sure monitoring is stopped */
 		__asm__ __volatile__ ("rum psr.up;;"::: "memory");
-		/*
-		 * allow user control (user monitors only)
-		if (task  == ctx->ctx_owner) {
-		 */
-		{
-			DBprintk(("clearing psr.sp for [%d]\n", current->pid));
-			ia64_psr(regs)->sp = 0;
-		}
+		ia64_srlz_i();
+
+		DBprintk(("clearing psr.sp for [%d]\n", current->pid));
+
+		/* allow user level control  */
+		ia64_psr(regs)->sp = 0;
+
+		/* PMU state will be saved/restored on ctxsw */
 		task->thread.flags |= IA64_THREAD_PM_VALID;
 	}

 	SET_PMU_OWNER(task);

-
 	ctx->ctx_flags.state = PFM_CTX_ENABLED;
 	atomic_set(&ctx->ctx_last_cpu, smp_processor_id());

@@ -2114,6 +2255,40 @@ pfm_enable(struct task_struct *task, pfm_context_t *ctx, void *arg, int count,
 	return 0;
 }

+static int
+pfm_get_pmc_reset(struct task_struct *task, pfm_context_t *ctx, void *arg, int count, 
+	   struct pt_regs *regs)
+{
+	pfarg_reg_t tmp, *req = (pfarg_reg_t *)arg;	/* req walks the caller's array of count requests */
+	unsigned int cnum;
+	int i;
+	/* for each request, copy it in, fill reg_value from reset_pmcs[] and copy it back to the user */
+	for (i = 0; i < count; i++, req++) {
+
+		if (copy_from_user(&tmp, req, sizeof(tmp))) return -EFAULT;
+
+		cnum = tmp.reg_num;
+
+		if (!PMC_IS_IMPL(cnum)) goto abort_mission;	/* unimplemented PMC: flag this request and stop */
+
+		tmp.reg_value = reset_pmcs[cnum];	/* value taken from the reset_pmcs[] table */
+
+		PFM_REG_RETFLAG_SET(tmp.reg_flags, 0);	/* mark this request as successful */
+
+		DBprintk(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, tmp.reg_value)); 
+
+		if (copy_to_user(req, &tmp, sizeof(tmp))) return -EFAULT;
+	}
+	return 0;
+abort_mission:
+	PFM_REG_RETFLAG_SET(tmp.reg_flags, PFM_REG_RETFL_EINVAL);	/* report the failure in the offending request */
+	/* 
+	 * XXX: if this fails, we stick with the original failure, flag not updated!
+	 */
+	copy_to_user(req, &tmp, sizeof(tmp));	/* best effort: result deliberately ignored, see XXX above */
+	return -EINVAL;
+}
+
 /*
 * functions MUST be listed in the increasing order of their index (see permfon.h)
 */
@@ -2121,19 +2296,19 @@ static pfm_cmd_desc_t pfm_cmd_tab[]={
 /* 0  */{ NULL, 0, 0, 0}, /* not used */
 /* 1  */{ pfm_write_pmcs, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
 /* 2  */{ pfm_write_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
-/* 3  */{ pfm_read_pmds, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
+/* 3  */{ pfm_read_pmds,PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)}, 
 /* 4  */{ pfm_stop, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 5  */{ pfm_start, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 6  */{ pfm_enable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 7  */{ pfm_disable, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 8  */{ pfm_create_context, PFM_CMD_ARG_READ, 1, sizeof(pfarg_context_t)},
-/* 9  */{ pfm_destroy_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
+/* 8  */{ pfm_context_create, PFM_CMD_PID|PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, 1, sizeof(pfarg_context_t)},
+/* 9  */{ pfm_context_destroy, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 10 */{ pfm_restart, PFM_CMD_PID|PFM_CMD_CTX|PFM_CMD_NOCHK, 0, 0},
 /* 11 */{ pfm_protect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
 /* 12 */{ pfm_get_features, PFM_CMD_ARG_WRITE, 0, 0},
 /* 13 */{ pfm_debug, 0, 1, sizeof(unsigned int)},
-/* 14 */{ pfm_unprotect_context, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
-/* 15 */{ NULL, 0, 0, 0}, /* not used */
+/* 14 */{ pfm_context_unprotect, PFM_CMD_PID|PFM_CMD_CTX, 0, 0},
+/* 15 */{ pfm_get_pmc_reset, PFM_CMD_ARG_READ|PFM_CMD_ARG_WRITE, PFM_CMD_ARG_MANY, sizeof(pfarg_reg_t)},
 /* 16 */{ NULL, 0, 0, 0}, /* not used */
 /* 17 */{ NULL, 0, 0, 0}, /* not used */
 /* 18 */{ NULL, 0, 0, 0}, /* not used */
@@ -2167,19 +2342,10 @@ check_task_state(struct task_struct *task)
 	 * after the task is marked as STOPPED but before pfm_save_regs()
 	 * is completed.
 	 */
-	for (;;) {
-
-		task_lock(task);
-		if (1 /*XXX !task_has_cpu(task)*/) break;
-		task_unlock(task);
-
-		do {
-			if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) return -EBUSY;
-			barrier();
-			cpu_relax();
-		} while (0 /*task_has_cpu(task)*/);
-	}
-	task_unlock(task);
+	if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) return -EBUSY;
+	DBprintk(("before wait_task_inactive [%d] state %ld\n", task->pid, task->state));
+	wait_task_inactive(task);
+	DBprintk(("after wait_task_inactive [%d] state %ld\n", task->pid, task->state));
 #else
 	if (task->state != TASK_ZOMBIE && task->state != TASK_STOPPED) {
 		DBprintk(("warning [%d] not in stable state %ld\n", task->pid, task->state));
@@ -2273,7 +2439,7 @@ sys_perfmonctl (pid_t pid, int cmd, void *arg, int count, long arg5, long arg6,
 }

 void
-pfm_ovfl_block_reset (void)
+pfm_ovfl_block_reset(void)
 {
 	struct thread_struct *th = &current->thread;
 	pfm_context_t *ctx = current->thread.pfm_context;
@@ -2353,18 +2519,17 @@ pfm_record_sample(struct task_struct *task, pfm_context_t *ctx, unsigned long ov
 	int j;


-pfm_recorded_samples_count++;
 	idx = ia64_fetch_and_add(1, &psb->psb_index);
-	DBprintk(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));
+	DBprintk_ovfl(("recording index=%ld entries=%ld\n", idx-1, psb->psb_entries));

 	/*
-	* XXX: there is a small chance that we could run out on index before resetting
-	* but index is unsigned long, so it will take some time.....
-	* We use > instead of == because fetch_and_add() is off by one (see below)
-	*
-	* This case can happen in non-blocking mode or with multiple processes.
-	* For non-blocking, we need to reload and continue.
-	 */
+	 * XXX: there is a small chance that we could run out on index before resetting
+	 * but index is unsigned long, so it will take some time.....
+	 * We use > instead of == because fetch_and_add() is off by one (see below)
+	 *
+	 * This case can happen in non-blocking mode or with multiple processes.
+	 * For non-blocking, we need to reload and continue.
+ 	 */
 	if (idx > psb->psb_entries) return 0;

 	/* first entry is really entry 0, not 1 caused by fetch_and_add */
@@ -2375,7 +2540,7 @@ pfm_recorded_samples_count++;
 	/*
 	 * initialize entry header
 	 */
-	h->pid  = task->pid;
+	h->pid  = current->pid;
 	h->cpu  = smp_processor_id();
 	h->rate = 0; /* XXX: add the sampling rate used here */
 	h->ip   = regs ? regs->cr_iip : 0x0;	/* where did the fault happened */
@@ -2403,24 +2568,27 @@ pfm_recorded_samples_count++;
 		} else {
 			*e = ia64_get_pmd(j); /* slow */
 		}
-		DBprintk(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
+		DBprintk_ovfl(("e=%p pmd%d =0x%lx\n", (void *)e, j, *e));
 		e++;
 	}
+	pfm_stats.pfm_recorded_samples_count++;
+
 	/*
 	 * make the new entry visible to user, needs to be atomic
 	 */
 	ia64_fetch_and_add(1, &psb->psb_hdr->hdr_count);

-	DBprintk(("index=%ld entries=%ld hdr_count=%ld\n", 
+	DBprintk_ovfl(("index=%ld entries=%ld hdr_count=%ld\n", 
 				idx, psb->psb_entries, psb->psb_hdr->hdr_count));
 	/* 
 	 * sampling buffer full ? 
 	 */
 	if (idx == (psb->psb_entries-1)) {
-		DBprintk(("sampling buffer full\n"));
+		DBprintk_ovfl(("sampling buffer full\n"));
 		/*
 		 * XXX: must reset buffer in blocking mode and lost notified
 		 */
+		pfm_stats.pfm_full_smpl_buffer_count++;
 		return 1;
 	}
 	return 0;
@@ -2433,15 +2601,13 @@ pfm_recorded_samples_count++;
 *	new value of pmc[0]. if 0x0 then unfreeze, else keep frozen
 */
 static unsigned long
-pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
+pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs)
 {
 	unsigned long mask;
 	struct thread_struct *t;
-	pfm_context_t *ctx;
 	unsigned long old_val;
 	unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL;
 	int i;
-	int my_cpu = smp_processor_id();
 	int ret = 1;
 	struct siginfo si;
 	/*
@@ -2457,18 +2623,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 	 * valid one, i.e. the one that caused the interrupt.
 	 */

-	if (task == NULL) {
-		DBprintk(("owners[%d]=NULL\n", my_cpu));
-		return 0x1;
-	}
 	t   = &task->thread;
-	ctx = task->thread.pfm_context;
-
-	if (!ctx) {
-		printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n", 
-			task->pid);
-		return 0;
-	}

 	/*
 	 * XXX: debug test
@@ -2490,12 +2645,12 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)

 	mask = pmc0 >> PMU_FIRST_COUNTER;

-	DBprintk(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
-		  " mode used_pmds=0x%lx save_pmcs=0x%lx reload_pmcs=0x%lx\n", 
+	DBprintk_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s"
+		  " mode used_pmds=0x%lx used_pmcs=0x%lx reload_pmcs=0x%lx\n", 
 			pmc0, task->pid, (regs ? regs->cr_iip : 0), 
 			CTX_OVFL_NOBLOCK(ctx) ? "nonblocking" : "blocking",
 			ctx->ctx_used_pmds[0],
-			ctx->ctx_saved_pmcs[0],
+			ctx->ctx_used_pmcs[0],
 			ctx->ctx_reload_pmcs[0]));

 	/*
@@ -2506,7 +2661,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 		/* skip pmd which did not overflow */
 		if ((mask & 0x1) == 0) continue;

-		DBprintk(("PMD[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n", 
+		DBprintk_ovfl(("pmd[%d] overflowed hw_pmd=0x%lx soft_pmd=0x%lx\n", 
 			  i, ia64_get_pmd(i), ctx->ctx_soft_pmds[i].val));

 		/*
@@ -2518,8 +2673,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 		old_val = ctx->ctx_soft_pmds[i].val;
 		ctx->ctx_soft_pmds[i].val = 1 + pmu_conf.perf_ovfl_val + pfm_read_soft_counter(ctx, i);

-
-		DBprintk(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx\n", 
+		DBprintk_ovfl(("soft_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx\n", 
 			  i, ctx->ctx_soft_pmds[i].val, old_val, 
 			  ia64_get_pmd(i) & pmu_conf.perf_ovfl_val));

@@ -2536,7 +2690,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)

 			ovfl_pmds |= 1UL << i;

-			DBprintk(("soft_pmd[%d] overflowed flags=0x%x, ovfl=0x%lx\n", i, ctx->ctx_soft_pmds[i].flags, ovfl_pmds));
+			DBprintk_ovfl(("soft_pmd[%d] overflowed flags=0x%x, ovfl=0x%lx\n", i, ctx->ctx_soft_pmds[i].flags, ovfl_pmds));

 			if (PMC_OVFL_NOTIFY(ctx, i)) {
 				ovfl_notify |= 1UL << i;
@@ -2575,7 +2729,8 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 	 * No overflow requiring a user level notification
 	 */
 	if (ovfl_notify == 0UL) {
-		pfm_reset_regs(ctx, &ovfl_pmds, PFM_RELOAD_SHORT_RESET);
+		if (ovfl_pmds) 
+			pfm_reset_regs(ctx, &ovfl_pmds, PFM_RELOAD_SHORT_RESET);
 		return 0x0;
 	}

@@ -2650,7 +2805,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 	 	 * necessarily go to the signal handler (if any) when it goes back to
 	 	 * user mode.
 	 	 */
-		DBprintk(("[%d] sending notification to [%d]\n", 
+		DBprintk_ovfl(("[%d] sending notification to [%d]\n", 
 			  task->pid, ctx->ctx_notify_task->pid));


@@ -2683,7 +2838,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 		 * before, changing it to NULL will still maintain this invariant.
 		 * Of course, when it is equal to current it cannot change at this point.
 		 */
-		DBprintk(("block=%d notify [%d] current [%d]\n", 
+		DBprintk_ovfl(("block=%d notify [%d] current [%d]\n", 
 			ctx->ctx_fl_block,
 			ctx->ctx_notify_task ? ctx->ctx_notify_task->pid: -1, 
 			current->pid ));
@@ -2694,7 +2849,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 	} else {
 lost_notify: /* XXX: more to do here, to convert to non-blocking (reset values) */

-		DBprintk(("notification task has disappeared !\n"));
+		DBprintk_ovfl(("notification task has disappeared !\n"));
 		/*
 		 * for a non-blocking context, we make sure we do not fall into the 
 		 * pfm_overflow_notify() trap. Also in the case of a blocking context with lost 
@@ -2716,7 +2871,7 @@ pfm_overflow_handler(struct task_struct *task, u64 pmc0, struct pt_regs *regs)
 	 */
 	ctx->ctx_fl_frozen = 1;

-	DBprintk(("reload pmc0=0x%x must_block=%ld\n",
+	DBprintk_ovfl(("return pmc0=0x%x must_block=%ld\n",
 				ctx->ctx_fl_frozen ? 0x1 : 0x0, t->pfm_ovfl_block_reset));

 	return ctx->ctx_fl_frozen ? 0x1 : 0x0;
@@ -2727,8 +2882,9 @@ perfmon_interrupt (int irq, void *arg, struct pt_regs *regs)
 {
 	u64 pmc0;
 	struct task_struct *task;
+	pfm_context_t *ctx;

-	pfm_ovfl_intr_count++;
+	pfm_stats.pfm_ovfl_intr_count++;

 	/* 
 	 * srlz.d done before arriving here
@@ -2742,24 +2898,54 @@ perfmon_interrupt (int irq, void *arg, struct pt_regs *regs)
 	 * assumes : if any PM[0].bit[63-1] is set, then PMC[0].fr = 1
 	 */
 	if ((pmc0 & ~0x1UL)!=0UL && (task=PMU_OWNER())!= NULL) {
-
 		/* 
-		 * assumes, PMC[0].fr = 1 at this point 
-		 *
-		 * XXX: change protype to pass &pmc0
+		 * we assume that pmc0.fr is always set here
 		 */
-		pmc0 = pfm_overflow_handler(task, pmc0, regs);
+		ctx = task->thread.pfm_context;

-		/* we never explicitely freeze PMU here */
-		if (pmc0 == 0) {
-			ia64_set_pmc(0, 0);
-			ia64_srlz_d();
+		/* sanity check */
+		if (!ctx) {
+			printk("perfmon: Spurious overflow interrupt: process %d has no PFM context\n", 
+				task->pid);
+			return;
 		}
+#ifdef CONFIG_SMP
+		/*
+		 * Because an IPI has higher priority than the PMU overflow interrupt, it is 
+		 * possible that the handler be interrupted by a request from another CPU to fetch 
+		 * the PMU state of the currently active context. The task may have just been 
+		 * migrated to another CPU which is trying to restore the context. If there was
+		 * a pending overflow interrupt when the task left this CPU, it is possible for
+		 * the handler to get interrupted by the IPI. In which case, the fetch request
+		 * MUST be postponed until the interrupt handler is done. The ctx_is_busy
+		 * flag indicates such a condition. The other CPU must busy wait until it's cleared.
+		 */
+		atomic_set(&ctx->ctx_is_busy, 1);
+#endif
+
+		/* 
+		 * assume PMC[0].fr = 1 at this point 
+		 */
+		pmc0 = pfm_overflow_handler(task, ctx, pmc0, regs);
+
+		/*
+		 * We always clear the overflow status bits and either unfreeze
+		 * or keep the PMU frozen.
+		 */
+		ia64_set_pmc(0, pmc0);
+		ia64_srlz_d();
+
+#ifdef CONFIG_SMP
+		/*
+		 * announce that we are done with the context
+		 */
+		atomic_set(&ctx->ctx_is_busy, 0);
+#endif
 	} else {
-		pfm_spurious_ovfl_intr_count++;
+		pfm_stats.pfm_spurious_ovfl_intr_count++;

-		DBprintk(("perfmon: Spurious PMU overflow interrupt on CPU%d: pmc0=0x%lx owner=%p\n", 
-			smp_processor_id(), pmc0, (void *)PMU_OWNER()));
+		printk("perfmon: Spurious PMU overflow interrupt on CPU%d: pmc0=0x%lx owner=%p\n", 
+			smp_processor_id(), pmc0, (void *)PMU_OWNER());
 	}
 }

@@ -2773,27 +2959,30 @@ perfmon_proc_info(char *page)
 #define cpu_is_online(i)        1
 #endif
 	char *p = page;
-	u64 pmc0 = ia64_get_pmc(0);
 	int i;

-	p += sprintf(p, "perfmon enabled: %s\n", pmu_conf.pfm_is_disabled ? "No": "Yes");
-
-	p += sprintf(p, "monitors_pmcs0]=0x%lx\n", pmu_conf.monitor_pmcs[0]);
-	p += sprintf(p, "counter_pmcds[0]=0x%lx\n", pmu_conf.counter_pmds[0]);
-	p += sprintf(p, "overflow interrupts=%lu\n", pfm_ovfl_intr_count);
-	p += sprintf(p, "spurious overflow interrupts=%lu\n", pfm_spurious_ovfl_intr_count);
-	p += sprintf(p, "recorded samples=%lu\n", pfm_recorded_samples_count);
-
-	p += sprintf(p, "CPU%d.pmc[0]=%lx\nPerfmon debug: %s\n", 
-			smp_processor_id(), pmc0, pfm_debug_mode ? "On" : "Off");
+	p += sprintf(p, "enabled          : %s\n", pmu_conf.pfm_is_disabled ? "No": "Yes");
+	p += sprintf(p, "fastctxsw        : %s\n", pfm_sysctl.fastctxsw > 0 ? "Yes": "No");
+	p += sprintf(p, "ovfl_mask        : 0x%lx\n", pmu_conf.perf_ovfl_val);
+	p += sprintf(p, "overflow intrs   : %lu\n", pfm_stats.pfm_ovfl_intr_count);
+	p += sprintf(p, "spurious intrs   : %lu\n", pfm_stats.pfm_spurious_ovfl_intr_count);
+	p += sprintf(p, "recorded samples : %lu\n", pfm_stats.pfm_recorded_samples_count);
+	p += sprintf(p, "smpl buffer full : %lu\n", pfm_stats.pfm_full_smpl_buffer_count);

 #ifdef CONFIG_SMP
-	p += sprintf(p, "CPU%d cpu_data.pfm_syst_wide=%d cpu_data.dcr_pp=%d\n", 
-			smp_processor_id(), local_cpu_data->pfm_syst_wide, local_cpu_data->pfm_dcr_pp);
+	p += sprintf(p, "CPU%d syst_wide   : %d\n"
+			"CPU%d dcr_pp      : %d\n", 
+			smp_processor_id(), 
+			this_cpu(pfm_syst_wide), 
+			smp_processor_id(), 
+			this_cpu(pfm_dcr_pp));
 #endif

 	LOCK_PFS();
-	p += sprintf(p, "proc_sessions=%lu\nsys_sessions=%lu\nsys_use_dbregs=%lu\nptrace_use_dbregs=%lu\n", 
+	p += sprintf(p, "proc_sessions    : %lu\n"
+			"sys_sessions     : %lu\n"
+			"sys_use_dbregs   : %lu\n"
+			"ptrace_use_dbregs: %lu\n", 
 			pfm_sessions.pfs_task_sessions, 
 			pfm_sessions.pfs_sys_sessions,
 			pfm_sessions.pfs_sys_use_dbregs,
@@ -2803,12 +2992,28 @@ perfmon_proc_info(char *page)

 	for(i=0; i < NR_CPUS; i++) {
 		if (cpu_is_online(i)) {
-			p += sprintf(p, "CPU%d.pmu_owner: %-6d\n",
+			p += sprintf(p, "CPU%d owner : %-6d\n",
 					i, 
 					pmu_owners[i].owner ? pmu_owners[i].owner->pid: -1);
 		}
 	}

+	for(i=0; pmd_desc[i].type != PFM_REG_NONE; i++) {
+		p += sprintf(p, "PMD%-2d: %d 0x%lx 0x%lx\n", 
+				i,
+				pmd_desc[i].type, 
+				pmd_desc[i].dep_pmd[0], 
+				pmd_desc[i].dep_pmc[0]); 
+	}
+
+	for(i=0; pmc_desc[i].type != PFM_REG_NONE; i++) {
+		p += sprintf(p, "PMC%-2d: %d 0x%lx 0x%lx\n", 
+				i, 
+				pmc_desc[i].type, 
+				pmc_desc[i].dep_pmd[0], 
+				pmc_desc[i].dep_pmc[0]); 
+	}
+
 	return p - page;
 }

@@ -2840,7 +3045,7 @@ pfm_syst_wide_update_task(struct task_struct *task, int mode)
 	/*
 	 * propagate the value of the dcr_pp bit to the psr
 	 */
-	ia64_psr(regs)->pp = mode ? local_cpu_data->pfm_dcr_pp : 0;
+	ia64_psr(regs)->pp = mode ? this_cpu(pfm_dcr_pp) : 0;
 }
 #endif

@@ -2867,6 +3072,7 @@ pfm_save_regs (struct task_struct *task)
 	 * It will be restored from ipsr when going back to user level
 	 */
 	__asm__ __volatile__ ("rum psr.up;;"::: "memory");
+	ia64_srlz_i();

 	ctx->ctx_saved_psr = psr;

@@ -2922,13 +3128,9 @@ pfm_lazy_save_regs (struct task_struct *task)
 	for (i=0; mask; i++, mask>>=1) {
 		if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
 	}
-	/*
-	 * XXX: simplify to pmc0 only
-	 */
-	mask = ctx->ctx_saved_pmcs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
-	}
+
+	/* save pmc0 */
+	t->pmc[0] = ia64_get_pmc(0);

 	/* not owned by this CPU */
 	atomic_set(&ctx->ctx_last_cpu, -1);
@@ -2966,6 +3168,12 @@ pfm_handle_fetch_regs(void *info)
 		  PMU_OWNER() ? PMU_OWNER()->pid: -1,
 		  atomic_read(&ctx->ctx_saving_in_progress)));

+	/* must wait until not busy before retrying whole request */
+	if (atomic_read(&ctx->ctx_is_busy)) {
+		arg->retval = 2;
+		return;
+	}
+
 	/* must wait if saving was interrupted */
 	if (atomic_read(&ctx->ctx_saving_in_progress)) {
 		arg->retval = 1;
@@ -2978,9 +3186,9 @@ pfm_handle_fetch_regs(void *info)
 		return;
 	}

-	DBprintk(("saving state for [%d] save_pmcs=0x%lx all_pmcs=0x%lx used_pmds=0x%lx\n", 
+	DBprintk(("saving state for [%d] used_pmcs=0x%lx reload_pmcs=0x%lx used_pmds=0x%lx\n", 
 		arg->task->pid,
-		ctx->ctx_saved_pmcs[0],
+		ctx->ctx_used_pmcs[0],
 		ctx->ctx_reload_pmcs[0],
 		ctx->ctx_used_pmds[0]));

@@ -2993,17 +3201,15 @@ pfm_handle_fetch_regs(void *info)

 	/*
 	 * XXX needs further optimization.
-	 * Also must take holes into account
 	 */
 	mask = ctx->ctx_used_pmds[0];
 	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) t->pmd[i] =ia64_get_pmd(i);
-	}
-	
-	mask = ctx->ctx_saved_pmcs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) t->pmc[i] = ia64_get_pmc(i);
+		if (mask & 0x1) t->pmd[i] = ia64_get_pmd(i);
 	}
+
+	/* save pmc0 */
+	t->pmc[0] = ia64_get_pmc(0);
+
 	/* not owned by this CPU */
 	atomic_set(&ctx->ctx_last_cpu, -1);

@@ -3032,11 +3238,17 @@ pfm_fetch_regs(int cpu, struct task_struct *task, pfm_context_t *ctx)
 	arg.task   = task;
 	arg.retval = -1;

+	if (atomic_read(&ctx->ctx_is_busy)) {
+must_wait_busy:
+		while (atomic_read(&ctx->ctx_is_busy));
+	}
+
 	if (atomic_read(&ctx->ctx_saving_in_progress)) {
 		DBprintk(("no IPI, must wait for [%d] to be saved on [%d]\n", task->pid, cpu));
-
+must_wait_saving:
 		/* busy wait */
 		while (atomic_read(&ctx->ctx_saving_in_progress));
+		DBprintk(("done saving for [%d] on [%d]\n", task->pid, cpu));
 		return;
 	}
 	DBprintk(("calling CPU %d from CPU %d\n", cpu, smp_processor_id()));
@@ -3056,11 +3268,8 @@ pfm_fetch_regs(int cpu, struct task_struct *task, pfm_context_t *ctx)
 	 * This is the case, where we interrupted the saving which started just at the time we sent the
 	 * IPI.
 	 */
-	if (arg.retval == 1) {
-		DBprintk(("must wait for [%d] to be saved on [%d]\n", task->pid, cpu));
-		while (atomic_read(&ctx->ctx_saving_in_progress));
-		DBprintk(("done saving for [%d] on [%d]\n", task->pid, cpu));
-	}
+	if (arg.retval == 1) goto must_wait_saving;
+	if (arg.retval == 2) goto must_wait_busy;
 }
 #endif /* CONFIG_SMP */

@@ -3114,55 +3323,53 @@ pfm_load_regs (struct task_struct *task)
 		pfm_fetch_regs(cpu, task, ctx);
 	}
 #endif
-	t   = &task->thread;
+	t = &task->thread;

 	/*
-	 * XXX: will be replaced by assembly routine
-	 * We clear all unused PMDs to avoid leaking information
+	 * To avoid leaking information to the user level when psr.sp=0,
+	 * we must reload ALL implemented pmds (even the ones we don't use).
+	 * In the kernel we only allow PFM_READ_PMDS on registers which
+	 * we initialized or requested (sampling) so there is no risk there.
+	 *
+	 * As an optimization, we will only reload the PMD that we use when 
+	 * the context is in protected mode, i.e. psr.sp=1 because then there
+	 * is no leak possible.
 	 */
-	mask = ctx->ctx_used_pmds[0];
+	mask = pfm_sysctl.fastctxsw || ctx->ctx_fl_protected ?  ctx->ctx_used_pmds[0] : ctx->ctx_reload_pmds[0];
 	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) 
-			ia64_set_pmd(i, t->pmd[i]);
-		else
-			ia64_set_pmd(i, 0UL);
+		if (mask & 0x1) ia64_set_pmd(i, t->pmd[i] & pmu_conf.perf_ovfl_val);
 	}
-	/* XXX: will need to clear all unused pmd, for security */

 	/* 
-	 * skip pmc[0] to avoid side-effects, 
-	 * all PMCs are systematically reloaded, unsued get default value
-	 * to avoid picking up stale configuration
+	 * PMC0 is never set in the mask because it is always restored
+	 * separately.  
+	 *
+	 * ALL PMCs are systematically reloaded, unused registers
+	 * get their default (PAL reset) values to avoid picking up 
+	 * stale configuration.
 	 */	
-	mask = ctx->ctx_reload_pmcs[0]>>1;
-	for (i=1; mask; i++, mask>>=1) {
+	mask = ctx->ctx_reload_pmcs[0];
+	for (i=0; mask; i++, mask>>=1) {
 		if (mask & 0x1) ia64_set_pmc(i, t->pmc[i]);
 	}

 	/*
-	 * restore debug registers when used for range restrictions.
-	 * We must restore the unused registers to avoid picking up
-	 * stale information.
+	 * we restore ALL the debug registers to avoid picking up 
+	 * stale state.
 	 */
-	mask = ctx->ctx_used_ibrs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) 
+	if (ctx->ctx_fl_using_dbreg) {
+		for (i=0; i < pmu_conf.num_ibrs; i++) {
 			ia64_set_ibr(i, t->ibr[i]);
-		else
-			ia64_set_ibr(i, 0UL);
-	}
-
-	mask = ctx->ctx_used_dbrs[0];
-	for (i=0; mask; i++, mask>>=1) {
-		if (mask & 0x1) 
+		}
+		ia64_srlz_i();
+		for (i=0; i < pmu_conf.num_dbrs; i++) {
 			ia64_set_dbr(i, t->dbr[i]);
-		else
-			ia64_set_dbr(i, 0UL);
+		}
 	}
+	ia64_srlz_d();

 	if (t->pmc[0] & ~0x1) {
-		ia64_srlz_d();
-		pfm_overflow_handler(task, t->pmc[0], NULL);
+		pfm_overflow_handler(task, ctx, t->pmc[0], NULL);
 	}

 	/*
@@ -3215,7 +3422,7 @@ ia64_reset_pmu(struct task_struct *task)
 			 * When restoring context, we must restore ALL pmcs, even the ones 
 			 * that the task does not use to avoid leaks and possibly corruption
 			 * of the sesion because of configuration conflicts. So here, we 
-			 * initializaed the table used in the context switch restore routine.
+			 * initialize the entire set used in the context switch restore routine.
 	 		 */
 			t->pmc[i] = reset_pmcs[i];
 			DBprintk((" pmc[%d]=0x%lx\n", i, reset_pmcs[i]));
@@ -3224,39 +3431,61 @@ ia64_reset_pmu(struct task_struct *task)
 	}
 	/*
 	 * clear reset values for PMD. 
-	 * XX: good up to 64 PMDS. Suppose that zero is a valid value.
+	 * XXX: good up to 64 PMDS. Suppose that zero is a valid value.
 	 */
 	mask = pmu_conf.impl_regs[4];
 	for(i=0; mask; mask>>=1, i++) {
 		if (mask & 0x1) ia64_set_pmd(i, 0UL);
+		t->pmd[i] = 0UL;
 	}

 	/*
-	 * On context switched restore, we must restore ALL pmc even
+	 * On context switched restore, we must restore ALL pmc and ALL pmd even
 	 * when they are not actively used by the task. In UP, the incoming process 
-	 * may otherwise pick up left over PMC state from the previous process.
+	 * may otherwise pick up left over PMC, PMD state from the previous process.
 	 * As opposed to PMD, stale PMC can cause harm to the incoming
 	 * process because they may change what is being measured. 
 	 * Therefore, we must systematically reinstall the entire
 	 * PMC state. In SMP, the same thing is possible on the 
-	 * same CPU but also on between 2 CPUs.
+	 * same CPU but also between 2 CPUs.
+	 *
+	 * The problem with PMD is information leaking especially
+	 * to user level when psr.sp=0
 	 *
 	 * There is unfortunately no easy way to avoid this problem
-	 * on either UP or SMP. This definitively slows down the 
-	 * pfm_load_regs(). 
+	 * on either UP or SMP. This definitively slows down the
+	 * pfm_load_regs() function. 
 	 */
 	
 	 /*
 	  * We must include all the PMC in this mask to make sure we don't
-	  * see any side effect of the stale state, such as opcode matching
+	  * see any side effect of a stale state, such as opcode matching
 	  * or range restrictions, for instance.
+	  *
+	  * We never directly restore PMC0 so we do not include it in the mask.
 	  */
-	ctx->ctx_reload_pmcs[0] = pmu_conf.impl_regs[0];
+	ctx->ctx_reload_pmcs[0] = pmu_conf.impl_regs[0] & ~0x1;
+	/*
+	 * We must include all the PMD in this mask to avoid picking
+	 * up stale value and leak information, especially directly
+	 * at the user level when psr.sp=0
+	 */
+	ctx->ctx_reload_pmds[0] = pmu_conf.impl_regs[4];
+
+	/* 
+	 * Keep track of the pmds we want to sample
+	 * XXX: maybe we don't need to save/restore the DEAR/IEAR pmds
+	 * but we do need the BTB for sure. This is because of a hardware
+	 * buffer of 1 only for non-BTB pmds.
+	 *
+	 * We ignore the unimplemented pmds specified by the user
+	 */
+	ctx->ctx_used_pmds[0] = ctx->ctx_smpl_regs[0] & pmu_conf.impl_regs[4];
+	ctx->ctx_used_pmcs[0] = 1; /* always save/restore PMC[0] */

 	/*
 	 * useful in case of re-enable after disable
 	 */
-	ctx->ctx_used_pmds[0] = 0UL;
 	ctx->ctx_used_ibrs[0] = 0UL;
 	ctx->ctx_used_dbrs[0] = 0UL;

@@ -3278,7 +3507,7 @@ pfm_flush_regs (struct task_struct *task)
 {
 	pfm_context_t *ctx;
 	u64 pmc0;
-	unsigned long mask, mask2, val;
+	unsigned long mask2, val;
 	int i;

 	ctx = task->thread.pfm_context;
@@ -3300,22 +3529,28 @@ pfm_flush_regs (struct task_struct *task)
 	 */
 	if (ctx->ctx_fl_system) {

-		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");

 		/* disable dcr pp */
 		ia64_set_dcr(ia64_get_dcr() & ~IA64_DCR_PP);

+		/* stop monitoring */
+		__asm__ __volatile__ ("rsm psr.pp;;"::: "memory");
+
+		ia64_srlz_i();
+
 #ifdef CONFIG_SMP
-		local_cpu_data->pfm_syst_wide = 0;
-		local_cpu_data->pfm_dcr_pp    = 0;
+		this_cpu(pfm_syst_wide) = 0;
+		this_cpu(pfm_dcr_pp)    = 0;
 #else
 		pfm_tasklist_toggle_pp(0);
 #endif
-
 	} else  {

+		/* stop monitoring */
 		__asm__ __volatile__ ("rum psr.up;;"::: "memory");

+		ia64_srlz_i();
+
 		/* no more save/restore on ctxsw */
 		current->thread.flags &= ~IA64_THREAD_PM_VALID;
 	}
@@ -3349,7 +3584,7 @@ pfm_flush_regs (struct task_struct *task)
 	ia64_srlz_d();

 	/*
-	 * We don't need to restore psr, because we are on our way out anyway
+	 * We don't need to restore psr, because we are on our way out
 	 */

 	/*
@@ -3365,10 +3600,12 @@ pfm_flush_regs (struct task_struct *task)
 	if (atomic_read(&ctx->ctx_last_cpu) != smp_processor_id()) 
 		printk("perfmon: [%d] last_cpu=%d\n", task->pid, atomic_read(&ctx->ctx_last_cpu));

-	mask  = pmc0 >> PMU_FIRST_COUNTER;
-	mask2 = ctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
-
-	for (i = PMU_FIRST_COUNTER; mask2; i++, mask>>=1, mask2>>=1) {
+	/*
+	 * we save all the used pmds
+	 * we take care of overflows for pmds used as counters
+	 */
+	mask2 = ctx->ctx_used_pmds[0];
+	for (i = 0; mask2; i++, mask2>>=1) {

 		/* skip non used pmds */
 		if ((mask2 & 0x1) == 0) continue;
@@ -3376,7 +3613,6 @@ pfm_flush_regs (struct task_struct *task)
 		val = ia64_get_pmd(i);

 		if (PMD_IS_COUNTING(i)) {
-
 			DBprintk(("[%d] pmd[%d] soft_pmd=0x%lx hw_pmd=0x%lx\n", task->pid, i, ctx->ctx_soft_pmds[i].val, val & pmu_conf.perf_ovfl_val));

 			/* collect latest results */
@@ -3389,15 +3625,19 @@ pfm_flush_regs (struct task_struct *task)
 			 */
 			task->thread.pmd[i] = 0;

-			/* take care of overflow inline */
-			if (mask & 0x1) {
+			/* 
+			 * take care of overflow inline
+			 */
+			if (pmc0 & (1UL << i)) {
 				ctx->ctx_soft_pmds[i].val += 1 + pmu_conf.perf_ovfl_val;
 				DBprintk(("[%d] pmd[%d] overflowed soft_pmd=0x%lx\n",
 					task->pid, i, ctx->ctx_soft_pmds[i].val));
 			}
 		} else {
 			DBprintk(("[%d] pmd[%d] hw_pmd=0x%lx\n", task->pid, i, val));
-			/* not a counter, just save value as is */
+			/* 
+			 * not a counter, just save value as is
+			 */
 			task->thread.pmd[i] = val;
 		}
 	}
@@ -3409,38 +3649,78 @@ pfm_flush_regs (struct task_struct *task)
 }


-
 /*
 * task is the newly created task, pt_regs for new child
 */
 int
 pfm_inherit(struct task_struct *task, struct pt_regs *regs)
 {
-	pfm_context_t *ctx = current->thread.pfm_context;
+	pfm_context_t *ctx;
 	pfm_context_t *nctx;
-	struct thread_struct *th = &task->thread;
+	struct thread_struct *thread;
 	unsigned long m;
 	int i;

+	/*
+	 * the new task was copied from parent and therefore points
+	 * to the parent's context at this point
+	 */
+	ctx    = task->thread.pfm_context;
+	thread = &task->thread;
+
 	/*
 	 * make sure child cannot mess up the monitoring session
 	 */
 	 ia64_psr(regs)->sp = 1;
 	 DBprintk(("enabling psr.sp for [%d]\n", task->pid));

-	 /*
-	  * remove any sampling buffer mapping from child user 
-	  * address space. Must be done for all cases of inheritance.
-	  */
-	 if (ctx->ctx_smpl_vaddr) pfm_remove_smpl_mapping(task);
+
+	/*
+	 * if there was a virtual mapping for the sampling buffer
+	 * the mapping is NOT inherited across fork() (see VM_DONTCOPY), 
+	 * so we don't have to explicitly remove it here.
+	 *
+	 *
+	 * Part of the clearing of fields is also done in
+	 * copy_thread() because the fields are outside the
+	 * pfm_context structure and can affect tasks not
+	 * using perfmon.
+	 */
+
+	/* clear pending notification */
+	task->thread.pfm_ovfl_block_reset = 0;
+
+	/*
+	 * clear cpu pinning restriction for child
+	 */
+	if (ctx->ctx_fl_system) {
+		set_cpus_allowed(task, ctx->ctx_saved_cpus_allowed);
+
+	 	DBprintk(("setting cpus_allowed for [%d] to 0x%lx from 0x%lx\n", 
+			task->pid,
+			ctx->ctx_saved_cpus_allowed, 
+			current->cpus_allowed));
+	}

 	/*
 	 * takes care of easiest case first
 	 */
 	if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_NONE) {
+
 		DBprintk(("removing PFM context for [%d]\n", task->pid));
-		task->thread.pfm_context     = NULL;
-		task->thread.pfm_ovfl_block_reset  = 0;
+
+		task->thread.pfm_context = NULL;
+
+		/* 
+		 * we must clear psr.up because the new child does
+		 * not have a context and the PM_VALID flag is cleared
+		 * in copy_thread().
+		 *
+		 * we do not clear psr.pp because it is always
+		 * controlled by the system wide logic and we should
+		 * never be here when system wide is running anyway
+		 */
+	 	ia64_psr(regs)->up = 0;

 		/* copy_thread() clears IA64_THREAD_PM_VALID */
 		return 0;
@@ -3454,69 +3734,82 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)

 	if (CTX_INHERIT_MODE(ctx) == PFM_FL_INHERIT_ONCE) {
 		nctx->ctx_fl_inherit = PFM_FL_INHERIT_NONE;
-		atomic_set(&nctx->ctx_last_cpu, -1);
-
-		/*
-		 * task is not yet visible in the tasklist, so we do 
-		 * not need to lock the newly created context.
-		 * However, we must grab the tasklist_lock to ensure
-		 * that the ctx_owner or ctx_notify_task do not disappear
-		 * while we increment their check counters.
-		 */
-		read_lock(&tasklist_lock);
+		DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));
+	}
+	/*
+	 * task is not yet visible in the tasklist, so we do 
+	 * not need to lock the newly created context.
+	 * However, we must grab the tasklist_lock to ensure
+	 * that the ctx_owner or ctx_notify_task do not disappear
+	 * while we increment their check counters.
+	 */
+	read_lock(&tasklist_lock);

-		if (nctx->ctx_notify_task) 
-			atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);
+	if (nctx->ctx_notify_task) 
+		atomic_inc(&nctx->ctx_notify_task->thread.pfm_notifiers_check);

-		if (nctx->ctx_owner)
-			atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);
+	if (nctx->ctx_owner)
+		atomic_inc(&nctx->ctx_owner->thread.pfm_owners_check);

-		read_unlock(&tasklist_lock);
+	read_unlock(&tasklist_lock);

-		DBprintk(("downgrading to INHERIT_NONE for [%d]\n", task->pid));

-		LOCK_PFS();
-		pfm_sessions.pfs_task_sessions++;
-		UNLOCK_PFS();
-	}
+	LOCK_PFS();
+	pfm_sessions.pfs_task_sessions++;
+	UNLOCK_PFS();

 	/* initialize counters in new context */
-	m = pmu_conf.counter_pmds[0] >> PMU_FIRST_COUNTER;
+	m = nctx->ctx_used_pmds[0] >> PMU_FIRST_COUNTER;
 	for(i = PMU_FIRST_COUNTER ; m ; m>>=1, i++) {
-		if (m & 0x1) {
+		if ((m & 0x1) && pmu_conf.pmd_desc[i].type == PFM_REG_COUNTING) {
 			nctx->ctx_soft_pmds[i].val = nctx->ctx_soft_pmds[i].ival & ~pmu_conf.perf_ovfl_val;
-			th->pmd[i]	      	   = nctx->ctx_soft_pmds[i].ival & pmu_conf.perf_ovfl_val;
+			thread->pmd[i]	      	   = nctx->ctx_soft_pmds[i].ival & pmu_conf.perf_ovfl_val;
 		}
+		/* what about the other pmds? zero or keep as is */

 	}
-	/* clear BTB index register */
-	th->pmd[16] = 0;
+	/*
+	 * clear BTB index register
+	 * XXX: CPU-model specific knowledge!
+	 */
+	thread->pmd[16] = 0;

-	/* if sampling then increment number of users of buffer */
-	if (nctx->ctx_psb) {

-		/*
-		 * XXX: nopt very pretty!
-		 */
+	nctx->ctx_fl_frozen    = 0;
+	nctx->ctx_ovfl_regs[0] = 0UL;
+	atomic_set(&nctx->ctx_last_cpu, -1);
+
+	/*
+	 * here nctx->ctx_psb == ctx->ctx_psb
+	 *
+	 * increment reference count to sampling
+	 * buffer, if any. Note that this is independent
+	 * from the virtual mapping. The latter is never
+	 * inherited while the former will be if context
+	 * is setup to something different from PFM_FL_INHERIT_NONE
+	 */
+	if (nctx->ctx_psb) {
 		LOCK_PSB(nctx->ctx_psb);
+
 		nctx->ctx_psb->psb_refcnt++;
+
+	 	DBprintk(("updated smpl @ %p refcnt=%lu psb_flags=0x%x\n", 
+			ctx->ctx_psb->psb_hdr,
+			ctx->ctx_psb->psb_refcnt,
+			ctx->ctx_psb->psb_flags));
+
 		UNLOCK_PSB(nctx->ctx_psb);
+
 		/*
 	 	 * remove any pointer to sampling buffer mapping
 	 	 */
 		nctx->ctx_smpl_vaddr = 0;
 	}

-	nctx->ctx_fl_frozen = 0;
-	nctx->ctx_ovfl_regs[0] = 0UL;
-
 	sema_init(&nctx->ctx_restart_sem, 0); /* reset this semaphore to locked */

-	/* clear pending notification */
-	th->pfm_ovfl_block_reset = 0;
-
 	/* link with new task */
-	th->pfm_context    = nctx;
+	thread->pfm_context = nctx;

 	DBprintk(("nctx=%p for process [%d]\n", (void *)nctx, task->pid));

@@ -3526,7 +3819,7 @@ pfm_inherit(struct task_struct *task, struct pt_regs *regs)
 	 */
 	if (current->thread.flags & IA64_THREAD_PM_VALID) {
 		DBprintk(("setting PM_VALID for [%d]\n", task->pid));
-		th->flags |= IA64_THREAD_PM_VALID;
+		thread->flags |= IA64_THREAD_PM_VALID;
 	}

 	return 0;
@@ -3555,9 +3848,9 @@ pfm_context_exit(struct task_struct *task)

 		LOCK_PSB(psb);

-		DBprintk(("sampling buffer from [%d] @%p size %ld vma_flag=0x%x\n",
+		DBprintk(("sampling buffer from [%d] @%p size %ld refcnt=%lu psb_flags=0x%x\n",
 			task->pid,
-			psb->psb_hdr, psb->psb_size, psb->psb_flags));
+			psb->psb_hdr, psb->psb_size, psb->psb_refcnt, psb->psb_flags));

 		/*
 		 * in the case where we are the last user, we may be able to free
@@ -3580,7 +3873,7 @@ pfm_context_exit(struct task_struct *task)
 			 *
 			 * See pfm_vm_close() and pfm_cleanup_smpl_buf() for more details.
 			 */
-			if ((psb->psb_flags & PFM_PSB_VMA) == 0) {
+			if ((psb->psb_flags & PSB_HAS_VMA) == 0) {

 				DBprintk(("cleaning sampling buffer from [%d] @%p size %ld\n",
 					task->pid,
@@ -3612,7 +3905,7 @@ pfm_context_exit(struct task_struct *task)
 	 * direct pointer to a task structure thereby bypassing the tasklist. 
 	 * We must make sure that, if we have task!= NULL, the target task is still 
 	 * present and is identical to the initial task specified 
-	 * during pfm_create_context(). It may already be detached from the tasklist but 
+	 * during pfm_context_create(). It may already be detached from the tasklist but 
 	 * that's okay. Note that it is okay if we miss the deadline and the task scans 
 	 * the list for nothing, it will affect performance but not correctness. 
 	 * The correctness is ensured by using the ctx_lock which prevents the 
@@ -3761,6 +4054,8 @@ pfm_cleanup_owners(struct task_struct *task)
 		}
 	}
 	read_unlock(&tasklist_lock);
+
+	atomic_set(&task->thread.pfm_owners_check, 0);
 }


@@ -3818,6 +4113,8 @@ pfm_cleanup_notifiers(struct task_struct *task)
 		}
 	}
 	read_unlock(&tasklist_lock);
+
+	atomic_set(&task->thread.pfm_notifiers_check, 0);
 }

 static struct irqaction perfmon_irqaction = {
@@ -3836,6 +4133,12 @@ pfm_pmu_snapshot(void)
 		if (i >= pmu_conf.num_pmcs) break;
 		if (PMC_IS_IMPL(i)) reset_pmcs[i] = ia64_get_pmc(i);
 	}
+#ifdef CONFIG_MCKINLEY
+	/*
+	 * set the 'stupid' enable bit to power the PMU!
+	 */
+	reset_pmcs[4] |= 1UL << 23;
+#endif
 }

 /*
@@ -3903,23 +4206,12 @@ perfmon_init (void)
 	 */
 	pfm_pmu_snapshot();

-	/* 
-	 * list the pmc registers used to control monitors 
-	 * XXX: unfortunately this information is not provided by PAL
-	 *
-	 * We start with the architected minimum and then refine for each CPU model
-	 */
-	pmu_conf.monitor_pmcs[0] = PMM(4)|PMM(5)|PMM(6)|PMM(7);
-
 	/*
-	 * architected counters
+	 * setup the register configuration descriptions for the CPU
 	 */
-	pmu_conf.counter_pmds[0] |= PMM(4)|PMM(5)|PMM(6)|PMM(7);
+	pmu_conf.pmc_desc = pmc_desc;
+	pmu_conf.pmd_desc = pmd_desc;

-#ifdef CONFIG_ITANIUM
-	pmu_conf.monitor_pmcs[0] |= PMM(10)|PMM(11)|PMM(12);
-	/* Itanium does not add more counters */
-#endif
 	/* we are all set */
 	pmu_conf.pfm_is_disabled = 0;

@@ -3928,6 +4220,8 @@ perfmon_init (void)
 	 */
 	perfmon_dir = create_proc_read_entry ("perfmon", 0, 0, perfmon_read_entry, NULL);

+	pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root, 0);
+
 	spin_lock_init(&pfm_sessions.pfs_lock);

 	return 0;
@@ -3942,7 +4236,6 @@ perfmon_init_percpu (void)
 	ia64_srlz_d();
 }

-
 #else /* !CONFIG_PERFMON */

 asmlinkage int

--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -194,13 +194,15 @@ ia64_save_extra (struct task_struct *task)
 		pfm_save_regs(task);

 # ifdef CONFIG_SMP
-	if (local_cpu_data->pfm_syst_wide)
+	if (this_cpu(pfm_syst_wide))
 		pfm_syst_wide_update_task(task, 0);
 # endif
 #endif

+#ifdef CONFIG_IA32_SUPPORT
 	if (IS_IA32_PROCESS(ia64_task_regs(task)))
 		ia32_save_state(task);
+#endif
 }

 void
@@ -214,12 +216,14 @@ ia64_load_extra (struct task_struct *task)
 		pfm_load_regs(task);

 # ifdef CONFIG_SMP
-	if (local_cpu_data->pfm_syst_wide) pfm_syst_wide_update_task(task, 1);
+	if (this_cpu(pfm_syst_wide)) pfm_syst_wide_update_task(task, 1);
 # endif
 #endif

+#ifdef CONFIG_IA32_SUPPORT
 	if (IS_IA32_PROCESS(ia64_task_regs(task)))
 		ia32_load_state(task);
+#endif
 }

 /*
@@ -357,6 +361,8 @@ copy_thread (int nr, unsigned long clone_flags,
 	 */
 	atomic_set(&p->thread.pfm_notifiers_check, 0);
 	atomic_set(&p->thread.pfm_owners_check, 0);
+	/* clear list of sampling buffer to free for new task */
+	p->thread.pfm_smpl_buf_list = NULL;

 	if (current->thread.pfm_context) retval = pfm_inherit(p, child_ptregs);
 #endif
@@ -566,9 +572,8 @@ exit_thread (void)
 		pfm_flush_regs(current);

 	/* free debug register resources */
-	if ((current->thread.flags & IA64_THREAD_DBG_VALID) != 0) {
+	if (current->thread.flags & IA64_THREAD_DBG_VALID)
 		pfm_release_debug_registers(current);
-	}
 #endif
 }


--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -559,6 +559,24 @@ cpu_init (void)
 	 */
 	identify_cpu(my_cpu_info);

+#ifdef CONFIG_MCKINLEY
+	{
+#define FEATURE_SET 16
+		struct ia64_pal_retval iprv;
+
+		if (my_cpu_data->family == 0x1f) {
+
+			PAL_CALL_PHYS(iprv, PAL_PROC_GET_FEATURES, 0, FEATURE_SET, 0);
+
+			if ((iprv.status == 0) && (iprv.v0 & 0x80) && (iprv.v2 & 0x80)) {
+
+				PAL_CALL_PHYS(iprv, PAL_PROC_SET_FEATURES,
+				              (iprv.v1 | 0x80), FEATURE_SET, 0);
+			}
+		}
+	}
+#endif
+
 	/* Clear the stack memory reserved for pt_regs: */
 	memset(ia64_task_regs(current), 0, sizeof(struct pt_regs));

@@ -570,7 +588,7 @@ cpu_init (void)
 	 * shouldn't be affected by this (moral: keep your ia32 locks aligned and you'll
 	 * be fine).
 	 */
-	ia64_set_dcr(  IA64_DCR_DM | IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
+	ia64_set_dcr(  IA64_DCR_DP | IA64_DCR_DK | IA64_DCR_DX | IA64_DCR_DR
 		     | IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC);
 #ifndef CONFIG_SMP
 	ia64_set_fpu_owner(0);

--- a/arch/ia64/kernel/signal.c
+++ b/arch/ia64/kernel/signal.c
@@ -143,9 +143,10 @@ copy_siginfo_to_user (siginfo_t *to, siginfo_t *from)
 {
 	if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t)))
 		return -EFAULT;
-	if (from->si_code < 0)
-		return __copy_to_user(to, from, sizeof(siginfo_t));
-	else {
+	if (from->si_code < 0) {
+		if (__copy_to_user(to, from, sizeof(siginfo_t)))
+			return -EFAULT;
+	} else {
 		int err;

 		/*

--- a/arch/ia64/lib/swiotlb.c
+++ b/arch/ia64/lib/swiotlb.c
@@ -478,6 +478,17 @@ swiotlb_dma_address (struct scatterlist *sg)
 	return SG_ENT_PHYS_ADDRESS(sg);
 }

+/*
+ * Return whether the given PCI device DMA address mask can be supported properly (e.g., a
+ * device driving only the low 24 bits while bus mastering would pass 0x00ffffff as MASK).
+ * This implementation ignores HWDEV and MASK and always returns 1 (mask supported).
+ */
+int
+swiotlb_pci_dma_supported (struct pci_dev *hwdev, u64 mask)
+{
+	return 1;
+}
+
 EXPORT_SYMBOL(swiotlb_init);
 EXPORT_SYMBOL(swiotlb_map_single);
 EXPORT_SYMBOL(swiotlb_unmap_single);
@@ -488,3 +499,4 @@ EXPORT_SYMBOL(swiotlb_sync_sg);
 EXPORT_SYMBOL(swiotlb_dma_address);
 EXPORT_SYMBOL(swiotlb_alloc_consistent);
 EXPORT_SYMBOL(swiotlb_free_consistent);
+EXPORT_SYMBOL(swiotlb_pci_dma_supported);
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -251,7 +251,7 @@ put_gate_page (struct page *page, unsigned long address)
 void __init
 ia64_mmu_init (void *my_cpu_data)
 {
-	unsigned long flags, rid, pta, impl_va_bits;
+	unsigned long psr, rid, pta, impl_va_bits;
 	extern void __init tlb_init (void);
 #ifdef CONFIG_DISABLE_VHPT
 #	define VHPT_ENABLE_BIT	0
@@ -263,7 +263,7 @@ ia64_mmu_init (void *my_cpu_data)
 	 * Set up the kernel identity mapping for regions 6 and 5.  The mapping for region
 	 * 7 is setup up in _start().
 	 */
-	ia64_clear_ic(flags);
+	psr = ia64_clear_ic();

 	rid = ia64_rid(IA64_REGION_ID_KERNEL, __IA64_UNCACHED_OFFSET);
 	ia64_set_rr(__IA64_UNCACHED_OFFSET, (rid << 8) | (IA64_GRANULE_SHIFT << 2));
@@ -277,7 +277,7 @@ ia64_mmu_init (void *my_cpu_data)
 	ia64_itr(0x2, IA64_TR_PERCPU_DATA, PERCPU_ADDR,
 		 pte_val(pfn_pte(__pa(my_cpu_data) >> PAGE_SHIFT, PAGE_KERNEL)), PAGE_SHIFT);

-	__restore_flags(flags);
+	ia64_set_psr(psr);
 	ia64_srlz_i();

 	/*

--- a/arch/ia64/sn/io/Makefile
+++ b/arch/ia64/sn/io/Makefile
@@ -18,7 +18,7 @@ EXTRA_CFLAGS    := -DLITTLE_ENDIAN
 O_TARGET := sgiio.o

 ifeq ($(CONFIG_MODULES),y)
-export-objs = pciio.o hcl.o
+export-objs = pciio.o hcl.o pci_dma.o
 endif

 obj-y  := stubs.o sgi_if.o pciio.o xtalk.o xbow.o xswitch.o klgraph_hack.o \

--- a/arch/ia64/sn/io/pci_dma.c
+++ b/arch/ia64/sn/io/pci_dma.c
@@ -4,6 +4,9 @@
 * for more details.
 *
 * Copyright (C) 2000,2002 Silicon Graphics, Inc. All rights reserved.
+ *
+ * Routines for PCI DMA mapping.  See Documentation/DMA-mapping.txt for
+ * a description of how these routines should be used.
 */

 #include <linux/types.h>
@@ -12,6 +15,7 @@
 #include <linux/pci.h>
 #include <linux/slab.h>
 #include <linux/devfs_fs_kernel.h>
+#include <linux/module.h>

 #include <asm/delay.h>
 #include <asm/io.h>
@@ -46,7 +50,7 @@ get_free_pciio_dmamap(devfs_handle_t pci_bus)
 	/*
 	 * Darn, we need to get the maps allocated for this bus.
 	 */
-	for (i=0; i<MAX_PCI_XWIDGET; i++) {
+	for (i = 0; i < MAX_PCI_XWIDGET; i++) {
 		if (busnum_to_pcibr_vhdl[i] == pci_bus) {
 			sn1_dma_map = busnum_to_atedmamaps[i];
 		}
@@ -314,22 +318,18 @@ sn1_pci_map_sg (struct pci_dev *hwdev,
 		}

 		/*
-		 * It is a 32bit card and we cannot do Direct mapping.
-		 * Let's 32Bit Page map the request.
+		 * It is a 32 bit card and we cannot do direct mapping,
+		 * so we use an ATE.
 		 */
-		dma_map = NULL;
-#ifdef CONFIG_IA64_SGI_SN1
-		dma_map = pciio_dmamap_alloc(vhdl, NULL, sg->length, 
-				PCIIO_BYTE_STREAM | PCIIO_DMA_DATA);
-#else
-		dma_map = pciio_dmamap_alloc(vhdl, NULL, sg->length, PCIIO_DMA_DATA);
-#endif
+		dma_map = 0;
+		dma_map = pciio_dmamap_alloc(vhdl, NULL, sg->length,
+					     DMA_DATA_FLAGS);
 		if (!dma_map) {
-			printk("pci_map_sg: Unable to allocate anymore 32Bits Page Map entries.\n");
+			printk(KERN_ERR "sn_pci_map_sg: Unable to allocate "
+			       "anymore 32 bit page map entries.\n");
 			BUG();
 		}
-		dma_addr = (dma_addr_t)pciio_dmamap_addr(dma_map, temp_ptr, sg->length);
-		/* printk("pci_map_sg: dma_map 0x%p Phys Addr 0x%p dma_addr 0x%p\n", dma_map, temp_ptr, dma_addr); */
+		dma_addr = pciio_dmamap_addr(dma_map, phys_addr, sg->length);
 		sg->address = (char *)dma_addr;
 		sg->page = (char *)dma_map;
 		
@@ -372,7 +372,17 @@ sn1_pci_unmap_sg (struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int

 }

-/*
+/**
+ * sn_pci_map_single - map a single region for DMA
+ * @hwdev: device to map for
+ * @ptr: kernel virtual address of the region to map
+ * @size: size of the region
+ * @direction: DMA direction
+ *
+ * Map the region pointed to by @ptr for DMA and return the
+ * DMA address.   Also known as platform_pci_map_single() by
+ * the IA64 machvec code.
+ *
 * We map this to the one step pciio_dmamap_trans interface rather than
 * the two step pciio_dmamap_alloc/pciio_dmamap_addr because we have
 * no way of saving the dmamap handle from the alloc to later free

--- a/arch/ia64/sn/kernel/misctest.c
+++ b/arch/ia64/sn/kernel/misctest.c
@@ -75,7 +75,7 @@ sgi_mcatest(void)
 	if (mcatest == 5) {
 		int zzzspec(long);
 		int	i;
-		long	flags, dcr, res, val, addr=0xff00000000UL;
+		long	psr, dcr, res, val, addr=0xff00000000UL;

 		dcr = ia64_get_dcr();
 		for (i=0; i<5; i++) {
@@ -87,11 +87,11 @@ sgi_mcatest(void)
 			ia64_set_dcr(dcr);
 			res = ia64_sn_probe_io_slot(0xff00000000UL, 8, &val);
 			printk("zzzspec: probe %ld, 0x%lx\n", res, val);
-			ia64_clear_ic(flags);
+			psr = ia64_clear_ic();
 			ia64_itc(0x2, 0xe00000ff00000000UL,
 			          pte_val(pfn_pte(0xff00000000UL >> PAGE_SHIFT,
 				  __pgprot(__DIRTY_BITS|_PAGE_PL_0|_PAGE_AR_RW))), _PAGE_SIZE_256M);
-			local_irq_restore(flags);
+			ia64_set_psr(psr);
 			ia64_srlz_i ();
 		}


--- a/arch/ia64/vmlinux.lds.S
+++ b/arch/ia64/vmlinux.lds.S
@@ -41,7 +41,8 @@ SECTIONS

  /* Read-only data */

-  __gp = ALIGN(16) + 0x200000;	/* gp must be 16-byte aligned for exc. table */
+  . = ALIGN(16);
+  __gp = . + 0x200000;	/* gp must be 16-byte aligned for exc. table */

  /* Global data */
  _data = .;

--- a/include/asm-ia64/efi.h
+++ b/include/asm-ia64/efi.h
@@ -7,9 +7,9 @@
 *
 * Copyright (C) 1999 VA Linux Systems
 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
- * Copyright (C) 1999 Hewlett-Packard Co.
- * Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
- * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com>
+ * Copyright (C) 1999, 2002 Hewlett-Packard Co.
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
+ *	Stephane Eranian <eranian@hpl.hp.com>
 */
 #include <linux/init.h>
 #include <linux/string.h>
@@ -258,8 +258,9 @@ extern void efi_map_pal_code (void);
 extern void efi_memmap_walk (efi_freemem_callback_t callback, void *arg);
 extern void efi_gettimeofday (struct timeval *tv);
 extern void efi_enter_virtual_mode (void);	/* switch EFI to virtual mode, if possible */
-extern u64  efi_get_iobase (void);
-extern u32  efi_mem_type (u64 phys_addr);
+extern u64 efi_get_iobase (void);
+extern u32 efi_mem_type (unsigned long phys_addr);
+extern u64 efi_mem_attributes (unsigned long phys_addr);

 /*
 * Variable Attributes

--- a/include/asm-ia64/elf.h
+++ b/include/asm-ia64/elf.h
@@ -38,7 +38,7 @@
 * the way of the program that it will "exec", and that there is
 * sufficient room for the brk.
 */
-#define ELF_ET_DYN_BASE		(TASK_UNMAPPED_BASE + 0x1000000)
+#define ELF_ET_DYN_BASE		(TASK_UNMAPPED_BASE + 0x800000000)


 /*

--- a/include/asm-ia64/kregs.h
+++ b/include/asm-ia64/kregs.h
@@ -2,8 +2,8 @@
 #define _ASM_IA64_KREGS_H

 /*
- * Copyright (C) 2001 Hewlett-Packard Co
- * Copyright (C) 2001 David Mosberger-Tang <davidm@hpl.hp.com>
+ * Copyright (C) 2001-2002 Hewlett-Packard Co
+ *	David Mosberger-Tang <davidm@hpl.hp.com>
 */
 /*
 * This file defines the kernel register usage convention used by Linux/ia64.
@@ -31,4 +31,121 @@
 #define IA64_TR_PERCPU_DATA	1	/* dtr1: percpu data */
 #define IA64_TR_CURRENT_STACK	2	/* dtr2: maps kernel's memory- & register-stacks */

+/* Processor status register bits: */
+#define IA64_PSR_BE_BIT		1
+#define IA64_PSR_UP_BIT		2
+#define IA64_PSR_AC_BIT		3
+#define IA64_PSR_MFL_BIT	4
+#define IA64_PSR_MFH_BIT	5
+#define IA64_PSR_IC_BIT		13
+#define IA64_PSR_I_BIT		14
+#define IA64_PSR_PK_BIT		15
+#define IA64_PSR_DT_BIT		17
+#define IA64_PSR_DFL_BIT	18
+#define IA64_PSR_DFH_BIT	19
+#define IA64_PSR_SP_BIT		20
+#define IA64_PSR_PP_BIT		21
+#define IA64_PSR_DI_BIT		22
+#define IA64_PSR_SI_BIT		23
+#define IA64_PSR_DB_BIT		24
+#define IA64_PSR_LP_BIT		25
+#define IA64_PSR_TB_BIT		26
+#define IA64_PSR_RT_BIT		27
+/* The following are not affected by save_flags()/restore_flags(): */
+#define IA64_PSR_CPL0_BIT	32
+#define IA64_PSR_CPL1_BIT	33
+#define IA64_PSR_IS_BIT		34
+#define IA64_PSR_MC_BIT		35
+#define IA64_PSR_IT_BIT		36
+#define IA64_PSR_ID_BIT		37
+#define IA64_PSR_DA_BIT		38
+#define IA64_PSR_DD_BIT		39
+#define IA64_PSR_SS_BIT		40
+#define IA64_PSR_RI_BIT		41
+#define IA64_PSR_ED_BIT		43
+#define IA64_PSR_BN_BIT		44
+
+#define IA64_PSR_BE	(__IA64_UL(1) << IA64_PSR_BE_BIT)
+#define IA64_PSR_UP	(__IA64_UL(1) << IA64_PSR_UP_BIT)
+#define IA64_PSR_AC	(__IA64_UL(1) << IA64_PSR_AC_BIT)
+#define IA64_PSR_MFL	(__IA64_UL(1) << IA64_PSR_MFL_BIT)
+#define IA64_PSR_MFH	(__IA64_UL(1) << IA64_PSR_MFH_BIT)
+#define IA64_PSR_IC	(__IA64_UL(1) << IA64_PSR_IC_BIT)
+#define IA64_PSR_I	(__IA64_UL(1) << IA64_PSR_I_BIT)
+#define IA64_PSR_PK	(__IA64_UL(1) << IA64_PSR_PK_BIT)
+#define IA64_PSR_DT	(__IA64_UL(1) << IA64_PSR_DT_BIT)
+#define IA64_PSR_DFL	(__IA64_UL(1) << IA64_PSR_DFL_BIT)
+#define IA64_PSR_DFH	(__IA64_UL(1) << IA64_PSR_DFH_BIT)
+#define IA64_PSR_SP	(__IA64_UL(1) << IA64_PSR_SP_BIT)
+#define IA64_PSR_PP	(__IA64_UL(1) << IA64_PSR_PP_BIT)
+#define IA64_PSR_DI	(__IA64_UL(1) << IA64_PSR_DI_BIT)
+#define IA64_PSR_SI	(__IA64_UL(1) << IA64_PSR_SI_BIT)
+#define IA64_PSR_DB	(__IA64_UL(1) << IA64_PSR_DB_BIT)
+#define IA64_PSR_LP	(__IA64_UL(1) << IA64_PSR_LP_BIT)
+#define IA64_PSR_TB	(__IA64_UL(1) << IA64_PSR_TB_BIT)
+#define IA64_PSR_RT	(__IA64_UL(1) << IA64_PSR_RT_BIT)
+/* The following are not affected by save_flags()/restore_flags(): */
+#define IA64_PSR_IS	(__IA64_UL(1) << IA64_PSR_IS_BIT)
+#define IA64_PSR_MC	(__IA64_UL(1) << IA64_PSR_MC_BIT)
+#define IA64_PSR_IT	(__IA64_UL(1) << IA64_PSR_IT_BIT)
+#define IA64_PSR_ID	(__IA64_UL(1) << IA64_PSR_ID_BIT)
+#define IA64_PSR_DA	(__IA64_UL(1) << IA64_PSR_DA_BIT)
+#define IA64_PSR_DD	(__IA64_UL(1) << IA64_PSR_DD_BIT)
+#define IA64_PSR_SS	(__IA64_UL(1) << IA64_PSR_SS_BIT)
+#define IA64_PSR_RI	(__IA64_UL(3) << IA64_PSR_RI_BIT)	/* mask of 3: two-bit field starting at RI_BIT */
+#define IA64_PSR_ED	(__IA64_UL(1) << IA64_PSR_ED_BIT)
+#define IA64_PSR_BN	(__IA64_UL(1) << IA64_PSR_BN_BIT)
+
+/* User mask bits: */
+#define IA64_PSR_UM	(IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH)
+
+/* Default Control Register */
+#define IA64_DCR_PP_BIT		 0	/* privileged performance monitor default */
+#define IA64_DCR_BE_BIT		 1	/* big-endian default */
+#define IA64_DCR_LC_BIT		 2	/* ia32 lock-check enable */
+#define IA64_DCR_DM_BIT		 8	/* defer TLB miss faults */
+#define IA64_DCR_DP_BIT		 9	/* defer page-not-present faults */
+#define IA64_DCR_DK_BIT		10	/* defer key miss faults */
+#define IA64_DCR_DX_BIT		11	/* defer key permission faults */
+#define IA64_DCR_DR_BIT		12	/* defer access right faults */
+#define IA64_DCR_DA_BIT		13	/* defer access bit faults */
+#define IA64_DCR_DD_BIT		14	/* defer debug faults */
+
+#define IA64_DCR_PP	(__IA64_UL(1) << IA64_DCR_PP_BIT)
+#define IA64_DCR_BE	(__IA64_UL(1) << IA64_DCR_BE_BIT)
+#define IA64_DCR_LC	(__IA64_UL(1) << IA64_DCR_LC_BIT)
+#define IA64_DCR_DM	(__IA64_UL(1) << IA64_DCR_DM_BIT)
+#define IA64_DCR_DP	(__IA64_UL(1) << IA64_DCR_DP_BIT)
+#define IA64_DCR_DK	(__IA64_UL(1) << IA64_DCR_DK_BIT)
+#define IA64_DCR_DX	(__IA64_UL(1) << IA64_DCR_DX_BIT)
+#define IA64_DCR_DR	(__IA64_UL(1) << IA64_DCR_DR_BIT)
+#define IA64_DCR_DA	(__IA64_UL(1) << IA64_DCR_DA_BIT)
+#define IA64_DCR_DD	(__IA64_UL(1) << IA64_DCR_DD_BIT)
+
+/* Interrupt Status Register */
+#define IA64_ISR_X_BIT		32	/* execute access */
+#define IA64_ISR_W_BIT		33	/* write access */
+#define IA64_ISR_R_BIT		34	/* read access */
+#define IA64_ISR_NA_BIT		35	/* non-access */
+#define IA64_ISR_SP_BIT		36	/* speculative load exception */
+#define IA64_ISR_RS_BIT		37	/* mandatory register-stack exception */
+#define IA64_ISR_IR_BIT		38	/* invalid register frame exception */
+#define IA64_ISR_CODE_MASK	0xf
+
+#define IA64_ISR_X	(__IA64_UL(1) << IA64_ISR_X_BIT)
+#define IA64_ISR_W	(__IA64_UL(1) << IA64_ISR_W_BIT)
+#define IA64_ISR_R	(__IA64_UL(1) << IA64_ISR_R_BIT)
+#define IA64_ISR_NA	(__IA64_UL(1) << IA64_ISR_NA_BIT)
+#define IA64_ISR_SP	(__IA64_UL(1) << IA64_ISR_SP_BIT)
+#define IA64_ISR_RS	(__IA64_UL(1) << IA64_ISR_RS_BIT)
+#define IA64_ISR_IR	(__IA64_UL(1) << IA64_ISR_IR_BIT)
+
+/* ISR code field for non-access instructions */
+#define IA64_ISR_CODE_TPA	0
+#define IA64_ISR_CODE_FC	1
+#define IA64_ISR_CODE_PROBE	2
+#define IA64_ISR_CODE_TAK	3
+#define IA64_ISR_CODE_LFETCH	4
+#define IA64_ISR_CODE_PROBEF	5
+
 #endif /* _ASM_IA64_kREGS_H */
--- a/include/asm-ia64/machvec.h
+++ b/include/asm-ia64/machvec.h
@@ -18,6 +18,7 @@ struct pci_dev;
 struct pt_regs;
 struct scatterlist;
 struct irq_desc;
+struct page;

 typedef void ia64_mv_setup_t (char **);
 typedef void ia64_mv_cpu_init_t(void);
@@ -45,6 +46,8 @@ typedef void ia64_mv_pci_unmap_sg (struct pci_dev *, struct scatterlist *, int,
 typedef void ia64_mv_pci_dma_sync_single (struct pci_dev *, dma_addr_t, size_t, int);
 typedef void ia64_mv_pci_dma_sync_sg (struct pci_dev *, struct scatterlist *, int, int);
 typedef unsigned long ia64_mv_pci_dma_address (struct scatterlist *);
+typedef int ia64_mv_pci_dma_supported (struct pci_dev *, u64);
+
 /*
 * WARNING: The legacy I/O space is _architected_.  Platforms are
 * expected to follow this architected model (see Section 10.7 in the
@@ -101,6 +104,7 @@ extern void machvec_noop (void);
 #  define platform_pci_dma_sync_single	ia64_mv.sync_single
 #  define platform_pci_dma_sync_sg	ia64_mv.sync_sg
 #  define platform_pci_dma_address	ia64_mv.dma_address
+#  define platform_pci_dma_supported	ia64_mv.dma_supported
 #  define platform_irq_desc		ia64_mv.irq_desc
 #  define platform_irq_to_vector	ia64_mv.irq_to_vector
 #  define platform_local_vector_to_irq	ia64_mv.local_vector_to_irq
@@ -136,6 +140,7 @@ struct ia64_machine_vector {
 	ia64_mv_pci_dma_sync_single *sync_single;
 	ia64_mv_pci_dma_sync_sg *sync_sg;
 	ia64_mv_pci_dma_address *dma_address;
+	ia64_mv_pci_dma_supported *dma_supported;
 	ia64_mv_irq_desc *irq_desc;
 	ia64_mv_irq_to_vector *irq_to_vector;
 	ia64_mv_local_vector_to_irq *local_vector_to_irq;
@@ -172,6 +177,7 @@ struct ia64_machine_vector {
 	platform_pci_dma_sync_single,		\
 	platform_pci_dma_sync_sg,		\
 	platform_pci_dma_address,		\
+	platform_pci_dma_supported,		\
 	platform_irq_desc,			\
 	platform_irq_to_vector,			\
 	platform_local_vector_to_irq,		\
@@ -269,6 +275,9 @@ extern ia64_mv_pci_dma_address swiotlb_dma_address;
 #ifndef platform_pci_dma_address
 # define  platform_pci_dma_address	swiotlb_dma_address
 #endif
+#ifndef platform_pci_dma_supported
+# define  platform_pci_dma_supported	swiotlb_pci_dma_supported
+#endif
 #ifndef platform_irq_desc
 # define platform_irq_desc		__ia64_irq_desc
 #endif

--- a/include/asm-ia64/machvec_hpzx1.h
+++ b/include/asm-ia64/machvec_hpzx1.h
@@ -11,6 +11,7 @@ extern ia64_mv_pci_unmap_single sba_unmap_single;
 extern ia64_mv_pci_map_sg sba_map_sg;
 extern ia64_mv_pci_unmap_sg sba_unmap_sg;
 extern ia64_mv_pci_dma_address sba_dma_address;
+extern ia64_mv_pci_dma_supported sba_dma_supported;

 /*
 * This stuff has dual use!
@@ -33,42 +34,6 @@ extern ia64_mv_pci_dma_address sba_dma_address;
 #define platform_pci_dma_sync_single	((ia64_mv_pci_dma_sync_single *) machvec_noop)
 #define platform_pci_dma_sync_sg	((ia64_mv_pci_dma_sync_sg *) machvec_noop)
 #define platform_pci_dma_address	sba_dma_address
-
-#endif /* _ASM_IA64_MACHVEC_HPZX1_h */
-#ifndef _ASM_IA64_MACHVEC_HPZX1_h
-#define _ASM_IA64_MACHVEC_HPZX1_h
-
-extern ia64_mv_setup_t dig_setup;
-extern ia64_mv_pci_fixup_t hpzx1_pci_fixup;
-extern ia64_mv_map_nr_t map_nr_dense;
-extern ia64_mv_pci_alloc_consistent sba_alloc_consistent;
-extern ia64_mv_pci_free_consistent sba_free_consistent;
-extern ia64_mv_pci_map_single sba_map_single;
-extern ia64_mv_pci_unmap_single sba_unmap_single;
-extern ia64_mv_pci_map_sg sba_map_sg;
-extern ia64_mv_pci_unmap_sg sba_unmap_sg;
-extern ia64_mv_pci_dma_address sba_dma_address;
-
-/*
- * This stuff has dual use!
- *
- * For a generic kernel, the macros are used to initialize the
- * platform's machvec structure.  When compiling a non-generic kernel,
- * the macros are used directly.
- */
-#define platform_name			"hpzx1"
-#define platform_setup			dig_setup
-#define platform_pci_fixup		hpzx1_pci_fixup
-#define platform_map_nr			map_nr_dense
-#define platform_pci_dma_init		((ia64_mv_pci_dma_init *) machvec_noop)
-#define platform_pci_alloc_consistent	sba_alloc_consistent
-#define platform_pci_free_consistent	sba_free_consistent
-#define platform_pci_map_single		sba_map_single
-#define platform_pci_unmap_single	sba_unmap_single
-#define platform_pci_map_sg		sba_map_sg
-#define platform_pci_unmap_sg		sba_unmap_sg
-#define platform_pci_dma_sync_single	((ia64_mv_pci_dma_sync_single *) machvec_noop)
-#define platform_pci_dma_sync_sg	((ia64_mv_pci_dma_sync_sg *) machvec_noop)
-#define platform_pci_dma_address	sba_dma_address
+#define platform_pci_dma_supported	sba_dma_supported

 #endif /* _ASM_IA64_MACHVEC_HPZX1_h */
--- a/include/asm-ia64/offsets.h
+++ b/include/asm-ia64/offsets.h
@@ -106,6 +106,7 @@
 #define IA64_SWITCH_STACK_AR_RNAT_OFFSET 536	/* 0x218 */
 #define IA64_SWITCH_STACK_AR_BSPSTORE_OFFSET 544	/* 0x220 */
 #define IA64_SWITCH_STACK_PR_OFFSET	552	/* 0x228 */
+#define IA64_SIGCONTEXT_IP_OFFSET	40	/* 0x28 */
 #define IA64_SIGCONTEXT_AR_BSP_OFFSET	72	/* 0x48 */
 #define IA64_SIGCONTEXT_AR_FPSR_OFFSET	104	/* 0x68 */
 #define IA64_SIGCONTEXT_AR_RNAT_OFFSET	80	/* 0x50 */

--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -68,20 +68,27 @@ do {						\
 */
 #define MAP_NR_DENSE(addr)	(((unsigned long) (addr) - PAGE_OFFSET) >> PAGE_SHIFT)

+#define page_to_pfn(page)	((unsigned long)((page) - mem_map))	/* NOTE(review): defined again in the #ifdef branches below -- confirm which is intended */
+#define pfn_valid(pfn)		((pfn) < max_mapnr)
+#define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)	/* NOTE(review): defined again in the #ifdef branches below -- confirm which is intended */
+
 #ifdef CONFIG_IA64_GENERIC
 # include <asm/machvec.h>
 # define virt_to_page(kaddr)	(mem_map + platform_map_nr(kaddr))
-# define page_to_phys(page)	((page - mem_map) << PAGE_SHIFT)
+# define page_to_pfn(page)	((unsigned long) (page - mem_map))
+# define pfn_to_page(pfn)	(mem_map + (pfn))
 #elif defined (CONFIG_IA64_SGI_SN1)
 # ifndef CONFIG_DISCONTIGMEM
 #  define virt_to_page(kaddr)	(mem_map + MAP_NR_DENSE(kaddr))
-#  define page_to_phys(page)	XXX fix me
+#  define page_to_pfn(page)	XXX fix me
+#  define pfn_to_page(pfn)	XXX fix me
 # endif
 #else
 # define virt_to_page(kaddr)	(mem_map + MAP_NR_DENSE(kaddr))
-# define page_to_phys(page)	((page - mem_map) << PAGE_SHIFT)
+# define page_to_pfn(page)	((unsigned long) (page - mem_map))
+# define pfn_to_page(pfn)	(mem_map + (pfn))
 #endif
-#define VALID_PAGE(page)	((page - mem_map) < max_mapnr)

 typedef union ia64_va {
 	struct {
@@ -105,7 +112,7 @@ typedef union ia64_va {
 #define REGION_OFFSET(x)	({ia64_va _v; _v.l = (long) (x); _v.f.off;})

 #define REGION_SIZE		REGION_NUMBER(1)
-#define REGION_KERNEL	7
+#define REGION_KERNEL		7

 #define BUG() do { printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); *(int *)0=0; } while (0)
 #define PAGE_BUG(page) do { BUG(); } while (0)

--- a/include/asm-ia64/pci.h
+++ b/include/asm-ia64/pci.h
@@ -58,6 +58,7 @@ pcibios_penalize_isa_irq (int irq)
 #define pci_dma_sync_single		platform_pci_dma_sync_single
 #define pci_dma_sync_sg			platform_pci_dma_sync_sg
 #define sg_dma_address			platform_pci_dma_address
+#define pci_dma_supported		platform_pci_dma_supported

 /* pci_unmap_{single,page} is not a nop, thus... */
 #define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)	\
@@ -73,17 +74,6 @@ pcibios_penalize_isa_irq (int irq)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)		\
 	(((PTR)->LEN_NAME) = (VAL))

-/*
- * Return whether the given PCI device DMA address mask can be supported properly.  For
- * example, if your device can only drive the low 24-bits during PCI bus mastering, then
- * you would pass 0x00ffffff as the mask to this function.
- */
-static inline int
-pci_dma_supported (struct pci_dev *hwdev, u64 mask)
-{
-	return 1;
-}
-
 #define pci_map_page(dev,pg,off,size,dir)				\
 	pci_map_single((dev), page_address(pg) + (off), (size), (dir))
 #define pci_unmap_page(dev,dma_addr,size,dir)				\

--- a/include/asm-ia64/perfmon.h
+++ b/include/asm-ia64/perfmon.h
@@ -23,6 +23,7 @@
 #define PFM_GET_FEATURES	0x0c
 #define PFM_DEBUG		0x0d
 #define PFM_UNPROTECT_CONTEXT	0x0e
+#define PFM_GET_PMC_RESET_VAL	0x0f


 /*
@@ -173,6 +174,8 @@ extern int  pfm_cleanup_smpl_buf(struct task_struct *);
 extern void pfm_syst_wide_update_task(struct task_struct *, int);
 extern void pfm_ovfl_block_reset (void);

+extern int pfm_syst_wide;
+
 #endif /* __KERNEL__ */

 #endif /* _ASM_IA64_PERFMON_H */
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -207,20 +207,14 @@ ia64_phys_addr_valid (unsigned long addr)
 #define VMALLOC_END		(0xa000000000000000 + (1UL << (4*PAGE_SHIFT - 9)))

 /*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
+ * Conversion functions: convert page frame number (pfn) and a protection value to a page
+ * table entry (pte).
 */
-#define mk_pte(page,pgprot)						\
-({									\
-	pte_t __pte;							\
-									\
-	pte_val(__pte) = (page_to_phys(page)) | pgprot_val(pgprot);	\
-	__pte;								\
-})
-
-/* This takes a physical page address that is used by the remapping functions */
-#define mk_pte_phys(physpage, pgprot) \
-({ pte_t __pte; pte_val(__pte) = physpage + pgprot_val(pgprot); __pte; })
+#define pfn_pte(pfn, pgprot) \
+({ pte_t __pte; pte_val(__pte) = ((pfn) << PAGE_SHIFT) | pgprot_val(pgprot); __pte; })
+
+/* Extract pfn from pte.  */
+#define pte_pfn(_pte)		((pte_val(_pte) & _PFN_MASK) >> PAGE_SHIFT)

 #define pte_modify(_pte, newprot) \
 	(__pte((pte_val(_pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)))

--- a/include/asm-ia64/processor.h
+++ b/include/asm-ia64/processor.h
@@ -55,123 +55,6 @@
 #define MCA_bus 0
 #define MCA_bus__is_a_macro /* for versions in ksyms.c */

-/* Processor status register bits: */
-#define IA64_PSR_BE_BIT		1
-#define IA64_PSR_UP_BIT		2
-#define IA64_PSR_AC_BIT		3
-#define IA64_PSR_MFL_BIT	4
-#define IA64_PSR_MFH_BIT	5
-#define IA64_PSR_IC_BIT		13
-#define IA64_PSR_I_BIT		14
-#define IA64_PSR_PK_BIT		15
-#define IA64_PSR_DT_BIT		17
-#define IA64_PSR_DFL_BIT	18
-#define IA64_PSR_DFH_BIT	19
-#define IA64_PSR_SP_BIT		20
-#define IA64_PSR_PP_BIT		21
-#define IA64_PSR_DI_BIT		22
-#define IA64_PSR_SI_BIT		23
-#define IA64_PSR_DB_BIT		24
-#define IA64_PSR_LP_BIT		25
-#define IA64_PSR_TB_BIT		26
-#define IA64_PSR_RT_BIT		27
-/* The following are not affected by save_flags()/restore_flags(): */
-#define IA64_PSR_CPL0_BIT	32
-#define IA64_PSR_CPL1_BIT	33
-#define IA64_PSR_IS_BIT		34
-#define IA64_PSR_MC_BIT		35
-#define IA64_PSR_IT_BIT		36
-#define IA64_PSR_ID_BIT		37
-#define IA64_PSR_DA_BIT		38
-#define IA64_PSR_DD_BIT		39
-#define IA64_PSR_SS_BIT		40
-#define IA64_PSR_RI_BIT		41
-#define IA64_PSR_ED_BIT		43
-#define IA64_PSR_BN_BIT		44
-
-#define IA64_PSR_BE	(__IA64_UL(1) << IA64_PSR_BE_BIT)
-#define IA64_PSR_UP	(__IA64_UL(1) << IA64_PSR_UP_BIT)
-#define IA64_PSR_AC	(__IA64_UL(1) << IA64_PSR_AC_BIT)
-#define IA64_PSR_MFL	(__IA64_UL(1) << IA64_PSR_MFL_BIT)
-#define IA64_PSR_MFH	(__IA64_UL(1) << IA64_PSR_MFH_BIT)
-#define IA64_PSR_IC	(__IA64_UL(1) << IA64_PSR_IC_BIT)
-#define IA64_PSR_I	(__IA64_UL(1) << IA64_PSR_I_BIT)
-#define IA64_PSR_PK	(__IA64_UL(1) << IA64_PSR_PK_BIT)
-#define IA64_PSR_DT	(__IA64_UL(1) << IA64_PSR_DT_BIT)
-#define IA64_PSR_DFL	(__IA64_UL(1) << IA64_PSR_DFL_BIT)
-#define IA64_PSR_DFH	(__IA64_UL(1) << IA64_PSR_DFH_BIT)
-#define IA64_PSR_SP	(__IA64_UL(1) << IA64_PSR_SP_BIT)
-#define IA64_PSR_PP	(__IA64_UL(1) << IA64_PSR_PP_BIT)
-#define IA64_PSR_DI	(__IA64_UL(1) << IA64_PSR_DI_BIT)
-#define IA64_PSR_SI	(__IA64_UL(1) << IA64_PSR_SI_BIT)
-#define IA64_PSR_DB	(__IA64_UL(1) << IA64_PSR_DB_BIT)
-#define IA64_PSR_LP	(__IA64_UL(1) << IA64_PSR_LP_BIT)
-#define IA64_PSR_TB	(__IA64_UL(1) << IA64_PSR_TB_BIT)
-#define IA64_PSR_RT	(__IA64_UL(1) << IA64_PSR_RT_BIT)
-/* The following are not affected by save_flags()/restore_flags(): */
-#define IA64_PSR_IS	(__IA64_UL(1) << IA64_PSR_IS_BIT)
-#define IA64_PSR_MC	(__IA64_UL(1) << IA64_PSR_MC_BIT)
-#define IA64_PSR_IT	(__IA64_UL(1) << IA64_PSR_IT_BIT)
-#define IA64_PSR_ID	(__IA64_UL(1) << IA64_PSR_ID_BIT)
-#define IA64_PSR_DA	(__IA64_UL(1) << IA64_PSR_DA_BIT)
-#define IA64_PSR_DD	(__IA64_UL(1) << IA64_PSR_DD_BIT)
-#define IA64_PSR_SS	(__IA64_UL(1) << IA64_PSR_SS_BIT)
-#define IA64_PSR_RI	(__IA64_UL(3) << IA64_PSR_RI_BIT)
-#define IA64_PSR_ED	(__IA64_UL(1) << IA64_PSR_ED_BIT)
-#define IA64_PSR_BN	(__IA64_UL(1) << IA64_PSR_BN_BIT)
-
-/* User mask bits: */
-#define IA64_PSR_UM	(IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH)
-
-/* Default Control Register */
-#define IA64_DCR_PP_BIT		 0	/* privileged performance monitor default */
-#define IA64_DCR_BE_BIT		 1	/* big-endian default */
-#define IA64_DCR_LC_BIT		 2	/* ia32 lock-check enable */
-#define IA64_DCR_DM_BIT		 8	/* defer TLB miss faults */
-#define IA64_DCR_DP_BIT		 9	/* defer page-not-present faults */
-#define IA64_DCR_DK_BIT		10	/* defer key miss faults */
-#define IA64_DCR_DX_BIT		11	/* defer key permission faults */
-#define IA64_DCR_DR_BIT		12	/* defer access right faults */
-#define IA64_DCR_DA_BIT		13	/* defer access bit faults */
-#define IA64_DCR_DD_BIT		14	/* defer debug faults */
-
-#define IA64_DCR_PP	(__IA64_UL(1) << IA64_DCR_PP_BIT)
-#define IA64_DCR_BE	(__IA64_UL(1) << IA64_DCR_BE_BIT)
-#define IA64_DCR_LC	(__IA64_UL(1) << IA64_DCR_LC_BIT)
-#define IA64_DCR_DM	(__IA64_UL(1) << IA64_DCR_DM_BIT)
-#define IA64_DCR_DP	(__IA64_UL(1) << IA64_DCR_DP_BIT)
-#define IA64_DCR_DK	(__IA64_UL(1) << IA64_DCR_DK_BIT)
-#define IA64_DCR_DX	(__IA64_UL(1) << IA64_DCR_DX_BIT)
-#define IA64_DCR_DR	(__IA64_UL(1) << IA64_DCR_DR_BIT)
-#define IA64_DCR_DA	(__IA64_UL(1) << IA64_DCR_DA_BIT)
-#define IA64_DCR_DD	(__IA64_UL(1) << IA64_DCR_DD_BIT)
-
-/* Interrupt Status Register */
-#define IA64_ISR_X_BIT		32	/* execute access */
-#define IA64_ISR_W_BIT		33	/* write access */
-#define IA64_ISR_R_BIT		34	/* read access */
-#define IA64_ISR_NA_BIT		35	/* non-access */
-#define IA64_ISR_SP_BIT		36	/* speculative load exception */
-#define IA64_ISR_RS_BIT		37	/* mandatory register-stack exception */
-#define IA64_ISR_IR_BIT		38	/* invalid register frame exception */
-#define IA64_ISR_CODE_MASK	0xf
-
-#define IA64_ISR_X	(__IA64_UL(1) << IA64_ISR_X_BIT)
-#define IA64_ISR_W	(__IA64_UL(1) << IA64_ISR_W_BIT)
-#define IA64_ISR_R	(__IA64_UL(1) << IA64_ISR_R_BIT)
-#define IA64_ISR_NA	(__IA64_UL(1) << IA64_ISR_NA_BIT)
-#define IA64_ISR_SP	(__IA64_UL(1) << IA64_ISR_SP_BIT)
-#define IA64_ISR_RS	(__IA64_UL(1) << IA64_ISR_RS_BIT)
-#define IA64_ISR_IR	(__IA64_UL(1) << IA64_ISR_IR_BIT)
-
-/* ISR code field for non-access instructions */
-#define IA64_ISR_CODE_TPA	0
-#define IA64_ISR_CODE_FC	1
-#define IA64_ISR_CODE_PROBE	2
-#define IA64_ISR_CODE_TAK	3
-#define IA64_ISR_CODE_LFETCH	4
-#define IA64_ISR_CODE_PROBEF	5
-
 #define IA64_THREAD_FPH_VALID	(__IA64_UL(1) << 0)	/* floating-point high state valid? */
 #define IA64_THREAD_DBG_VALID	(__IA64_UL(1) << 1)	/* debug registers valid? */
 #define IA64_THREAD_PM_VALID	(__IA64_UL(1) << 2)	/* performance registers valid? */
@@ -290,8 +173,6 @@ extern struct cpuinfo_ia64 {
 	__u64 ipi_count;
 	__u64 prof_counter;
 	__u64 prof_multiplier;
-	__u32 pfm_syst_wide;
-	__u32 pfm_dcr_pp;
 #endif
 } cpu_info __per_cpu_data;

@@ -632,14 +513,22 @@ ia64_invala (void)
 	asm volatile ("invala" ::: "memory");
 }

+static inline __u64
+ia64_clear_ic (void)	/* read psr, then clear psr.i and psr.ic; returns the value read */
+{
+	__u64 psr;	/* filled by "mov %0=psr", which is issued before the rsm */
+	asm volatile ("mov %0=psr;; rsm psr.i | psr.ic;; srlz.i;;" : "=r"(psr) :: "memory");
+	return psr;	/* psr.i/psr.ic are off now: don't trigger mandatory RSE references */
+}
+
 /*
- * Save the processor status flags in FLAGS and then clear the interrupt collection and
- * interrupt enable bits.  Don't trigger any mandatory RSE references while this bit is
- * off!
+ * Restore the processor status register: write PSR to psr.l, then data-serialize.
 */
-#define ia64_clear_ic(flags)						\
-	asm volatile ("mov %0=psr;; rsm psr.i | psr.ic;; srlz.i;;"	\
-			      : "=r"(flags) :: "memory");
+static inline void
+ia64_set_psr (__u64 psr)	/* psr: value to write to psr.l */
+{
+	asm volatile (";; mov psr.l=%0;; srlz.d" :: "r" (psr) : "memory");
+}

 /*
 * Insert a translation into an instruction and/or data translation

--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -14,6 +14,7 @@
 */
 #include <linux/config.h>

+#include <asm/kregs.h>
 #include <asm/page.h>

 #define KERNEL_START		(PAGE_OFFSET + 68*1024*1024)
@@ -30,7 +31,7 @@ struct pci_vector_struct {
 	__u16 bus;	/* PCI Bus number */
 	__u32 pci_id;	/* ACPI split 16 bits device, 16 bits function (see section 6.1.1) */
 	__u8 pin;	/* PCI PIN (0 = A, 1 = B, 2 = C, 3 = D) */
-	__u8 irq;	/* IRQ assigned */
+	__u32 irq;	/* IRQ assigned */
 };

 extern struct ia64_boot_param {
@@ -135,16 +136,21 @@ do {											\
 	}										\
 } while (0)

-# define local_irq_restore(x)						 \
-do {									 \
-	unsigned long ip, old_psr, psr = (x);				 \
-									 \
-	__asm__ __volatile__ (";;mov %0=psr; mov psr.l=%1;; srlz.d"	 \
-			      : "=&r" (old_psr) : "r" (psr) : "memory"); \
-	if ((old_psr & (1UL << 14)) && !(psr & (1UL << 14))) {		 \
-		__asm__ ("mov %0=ip" : "=r"(ip));			 \
-		last_cli_ip = ip;					 \
-	}								 \
+# define local_irq_restore(x)							\
+do {										\
+	unsigned long ip, old_psr, psr = (x);					\
+	/* Change only psr.i: p6 (bit set in x) => ssm, p7 (bit clear) => rsm. */	\
+	__asm__ __volatile__ ("mov %0=psr;"					\
+			      "cmp.ne p6,p7=%1,r0;;"				\
+			      "(p6) ssm psr.i;"					\
+			      "(p7) rsm psr.i;;"				\
+			      "srlz.d"						\
+			      : "=&r" (old_psr) : "r"((psr) & IA64_PSR_I)	\
+			      : "p6", "p7", "memory");				\
+	if ((old_psr & IA64_PSR_I) && !(psr & IA64_PSR_I)) {			\
+		__asm__ ("mov %0=ip" : "=r"(ip));				\
+		last_cli_ip = ip;	/* ip where psr.i went from set to clear */	\
+	}									\
 } while (0)

 #else /* !CONFIG_IA64_DEBUG_IRQ */
@@ -153,8 +159,12 @@ do {									 \
 						      : "=r" (x) :: "memory")
 # define local_irq_disable()	__asm__ __volatile__ (";; rsm psr.i;;" ::: "memory")
 /* (potentially) setting psr.i requires data serialization: */
-# define local_irq_restore(x)	__asm__ __volatile__ (";; mov psr.l=%0;; srlz.d"	\
-						      :: "r" (x) : "memory")
+# define local_irq_restore(x)	__asm__ __volatile__ ("cmp.ne p6,p7=%0,r0;;"	\
+						      "(p6) ssm psr.i;"	/* x had psr.i set */	\
+						      "(p7) rsm psr.i;;" /* x had psr.i clear */	\
+						      "srlz.d"			\
+						      :: "r"((x) & IA64_PSR_I)	\
+						      : "p6", "p7", "memory")
 #endif /* !CONFIG_IA64_DEBUG_IRQ */

 #define local_irq_enable()	__asm__ __volatile__ (";; ssm psr.i;; srlz.d" ::: "memory")

--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -222,6 +222,7 @@
 #define __NR_futex			1230
 #define __NR_sched_setaffinity		1231
 #define __NR_sched_getaffinity		1232
+#define __NR_security			1233

 #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)