Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux

Pull ARM64 updates from Catalin Marinas: - KGDB support for arm64 - PCI I/O space extended to 16M (in preparation of PCIe support patches) - Dropping ZONE_DMA32 in favour of ZONE_DMA (we only need one for the time being), together with swiotlb late initialisation to correctly setup the bounce buffer - DMA API cache maintenance support (not all ARMv8 platforms have hardware cache coherency) - Crypto extensions advertising via ELF_HWCAP2 for compat user space - Perf support for dwarf unwinding in compat mode - asm/tlb.h converted to the generic mmu_gather code - asm-generic rwsem implementation - Code clean-up * tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (42 commits) arm64: Remove pgprot_dmacoherent() arm64: Support DMA_ATTR_WRITE_COMBINE arm64: Implement custom mmap functions for dma mapping arm64: Fix __range_ok macro arm64: Fix duplicated Kconfig entries arm64: mm: Route pmd thp functions through pte equivalents arm64: rwsem: use asm-generic rwsem implementation asm-generic: rwsem: de-PPCify rwsem.h arm64: enable generic CPU feature modalias matching for this architecture arm64: smp: make local symbol static arm64: debug: make local symbols static ARM64: perf: support dwarf unwinding in compat mode ARM64: perf: add support for frame pointer unwinding in compat mode ARM64: perf: add support for perf registers API arm64: Add boot time configuration of Intermediate Physical Address size arm64: Do not synchronise I and D caches for special ptes arm64: Make DMA coherent and strongly ordered mappings not executable arm64: barriers: add dmb barrier arm64: topology: Implement basic CPU topology support arm64: advertise ARMv8 extensions to 32-bit compat ELF binaries ...

Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull ARM64 updates from Catalin Marinas: - KGDB support for arm64 - PCI I/O space extended to 16M (in preparation of PCIe support patches) - Dropping ZONE_DMA32 in favour of ZONE_DMA (we only need one for the time being), together with swiotlb late initialisation to correctly setup the bounce buffer - DMA API cache maintenance support (not all ARMv8 platforms have hardware cache coherency) - Crypto extensions advertising via ELF_HWCAP2 for compat user space - Perf support for dwarf unwinding in compat mode - asm/tlb.h converted to the generic mmu_gather code - asm-generic rwsem implementation - Code clean-up * tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (42 commits) arm64: Remove pgprot_dmacoherent() arm64: Support DMA_ATTR_WRITE_COMBINE arm64: Implement custom mmap functions for dma mapping arm64: Fix __range_ok macro arm64: Fix duplicated Kconfig entries arm64: mm: Route pmd thp functions through pte equivalents arm64: rwsem: use asm-generic rwsem implementation asm-generic: rwsem: de-PPCify rwsem.h arm64: enable generic CPU feature modalias matching for this architecture arm64: smp: make local symbol static arm64: debug: make local symbols static ARM64: perf: support dwarf unwinding in compat mode ARM64: perf: add support for frame pointer unwinding in compat mode ARM64: perf: add support for perf registers API arm64: Add boot time configuration of Intermediate Physical Address size arm64: Do not synchronise I and D caches for special ptes arm64: Make DMA coherent and strongly ordered mappings not executable arm64: barriers: add dmb barrier arm64: topology: Implement basic CPU topology support arm64: advertise ARMv8 extensions to 32-bit compat ELF binaries ...
1ce235fa · Linus Torvalds · e38be1b1 · 196adf2f · 1ce235fa · 1ce235fa
Commit 1ce235fa authored Mar 31, 2014 by Linus Torvalds
45 changed files
--- a/Documentation/arm64/memory.txt
+++ b/Documentation/arm64/memory.txt
@@ -35,11 +35,13 @@ ffffffbc00000000	ffffffbdffffffff	   8GB		vmemmap

 ffffffbe00000000	ffffffbffbbfffff	  ~8GB		[guard, future vmmemap]

-ffffffbffbc00000	ffffffbffbdfffff	   2MB		earlyprintk device
+ffffffbffa000000	ffffffbffaffffff	  16MB		PCI I/O space
+
+ffffffbffb000000	ffffffbffbbfffff	  12MB		[guard]

-ffffffbffbe00000	ffffffbffbe0ffff	  64KB		PCI I/O space
+ffffffbffbc00000	ffffffbffbdfffff	   2MB		earlyprintk device

-ffffffbffbe10000	ffffffbcffffffff	  ~2MB		[guard]
+ffffffbffbe00000	ffffffbffbffffff	   2MB		[guard]

 ffffffbffc000000	ffffffbfffffffff	  64MB		modules

@@ -60,11 +62,13 @@ fffffdfc00000000	fffffdfdffffffff	   8GB		vmemmap

 fffffdfe00000000	fffffdfffbbfffff	  ~8GB		[guard, future vmmemap]

-fffffdfffbc00000	fffffdfffbdfffff	   2MB		earlyprintk device
+fffffdfffa000000	fffffdfffaffffff	  16MB		PCI I/O space
+
+fffffdfffb000000	fffffdfffbbfffff	  12MB		[guard]

-fffffdfffbe00000	fffffdfffbe0ffff	  64KB		PCI I/O space
+fffffdfffbc00000	fffffdfffbdfffff	   2MB		earlyprintk device

-fffffdfffbe10000	fffffdfffbffffff	  ~2MB		[guard]
+fffffdfffbe00000	fffffdfffbffffff	   2MB		[guard]

 fffffdfffc000000	fffffdffffffffff	  64MB		modules


--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -16,6 +16,7 @@ config ARM64
 	select DCACHE_WORD_ACCESS
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
+	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_IOMAP
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW
@@ -26,6 +27,7 @@ config ARM64
 	select GENERIC_TIME_VSYSCALL
 	select HARDIRQS_SW_RESEND
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
@@ -38,6 +40,8 @@ config ARM64
 	select HAVE_MEMBLOCK
 	select HAVE_PATA_PLATFORM
 	select HAVE_PERF_EVENTS
+	select HAVE_PERF_REGS
+	select HAVE_PERF_USER_STACK_DUMP
 	select IRQ_DOMAIN
 	select MODULES_USE_ELF_RELA
 	select NO_BOOTMEM
@@ -73,7 +77,7 @@ config LOCKDEP_SUPPORT
 config TRACE_IRQFLAGS_SUPPORT
 	def_bool y

-config RWSEM_GENERIC_SPINLOCK
+config RWSEM_XCHGADD_ALGORITHM
 	def_bool y

 config GENERIC_HWEIGHT
@@ -85,7 +89,7 @@ config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
 	def_bool y

-config ZONE_DMA32
+config ZONE_DMA
 	def_bool y

 config ARCH_DMA_ADDR_T_64BIT
@@ -164,6 +168,22 @@ config SMP

 	  If you don't know what to do here, say N.

+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places. If unsure say N here.
+
+config SCHED_SMT
+	bool "SMT scheduler support"
+	depends on SMP
+	help
+	  Improves the CPU scheduler's decision making when dealing with
+	  MultiThreading at a cost of slightly increased overhead in some
+	  places. If unsure say N here.
+
 config NR_CPUS
 	int "Maximum number of CPUs (2-32)"
 	range 2 32
@@ -301,6 +321,8 @@ menu "CPU Power Management"

 source "drivers/cpuidle/Kconfig"

+source "drivers/cpufreq/Kconfig"
+
 endmenu

 source "net/Kconfig"

--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -32,6 +32,7 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
+generic-y += rwsem.h
 generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += segment.h

--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -25,6 +25,7 @@
 #define wfi()		asm volatile("wfi" : : : "memory")

 #define isb()		asm volatile("isb" : : : "memory")
+#define dmb(opt)	asm volatile("dmb sy" : : : "memory")
 #define dsb(opt)	asm volatile("dsb sy" : : : "memory")

 #define mb()		dsb()

--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -84,6 +84,13 @@ static inline void flush_cache_page(struct vm_area_struct *vma,
 {
 }

+/*
+ * Cache maintenance functions used by the DMA API. No to be used directly.
+ */
+extern void __dma_map_area(const void *, size_t, int);
+extern void __dma_unmap_area(const void *, size_t, int);
+extern void __dma_flush_range(const void *, const void *);
+
 /*
 * Copy user data from/to a page which is mapped into a different
 * processes address space.  Really, we want to allow our "user

--- a/arch/arm64/include/asm/compat.h
+++ b/arch/arm64/include/asm/compat.h
@@ -228,7 +228,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr)
 	return (u32)(unsigned long)uptr;
 }

-#define compat_user_stack_pointer() (current_pt_regs()->compat_sp)
+#define compat_user_stack_pointer() (user_stack_pointer(current_pt_regs()))

 static inline void __user *arch_compat_alloc_user_space(long len)
 {

--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
+/*
+ * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __ASM_CPUFEATURE_H
+#define __ASM_CPUFEATURE_H
+
+#include <asm/hwcap.h>
+
+/*
+ * In the arm64 world (as in the ARM world), elf_hwcap is used both internally
+ * in the kernel and for user space to keep track of which optional features
+ * are supported by the current system. So let's map feature 'x' to HWCAP_x.
+ * Note that HWCAP_x constants are bit fields so we need to take the log.
+ */
+
+#define MAX_CPU_FEATURES	(8 * sizeof(elf_hwcap))
+#define cpu_feature(x)		ilog2(HWCAP_ ## x)
+
+static inline bool cpu_have_feature(unsigned int num)
+{
+	return elf_hwcap & (1UL << num);
+}
+
+#endif
--- a/arch/arm64/include/asm/debug-monitors.h
+++ b/arch/arm64/include/asm/debug-monitors.h
@@ -26,6 +26,53 @@
 #define DBG_ESR_EVT_HWWP	0x2
 #define DBG_ESR_EVT_BRK		0x6

+/*
+ * Break point instruction encoding
+ */
+#define BREAK_INSTR_SIZE		4
+
+/*
+ * ESR values expected for dynamic and compile time BRK instruction
+ */
+#define DBG_ESR_VAL_BRK(x)	(0xf2000000 | ((x) & 0xfffff))
+
+/*
+ * #imm16 values used for BRK instruction generation
+ * Allowed values for kgbd are 0x400 - 0x7ff
+ * 0x400: for dynamic BRK instruction
+ * 0x401: for compile time BRK instruction
+ */
+#define KGDB_DYN_DGB_BRK_IMM		0x400
+#define KDBG_COMPILED_DBG_BRK_IMM	0x401
+
+/*
+ * BRK instruction encoding
+ * The #imm16 value should be placed at bits[20:5] within BRK ins
+ */
+#define AARCH64_BREAK_MON	0xd4200000
+
+/*
+ * Extract byte from BRK instruction
+ */
+#define KGDB_DYN_DGB_BRK_INS_BYTE(x) \
+	((((AARCH64_BREAK_MON) & 0xffe0001f) >> (x * 8)) & 0xff)
+
+/*
+ * Extract byte from BRK #imm16
+ */
+#define KGBD_DYN_DGB_BRK_IMM_BYTE(x) \
+	(((((KGDB_DYN_DGB_BRK_IMM) & 0xffff) << 5) >> (x * 8)) & 0xff)
+
+#define KGDB_DYN_DGB_BRK_BYTE(x) \
+	(KGDB_DYN_DGB_BRK_INS_BYTE(x) | KGBD_DYN_DGB_BRK_IMM_BYTE(x))
+
+#define  KGDB_DYN_BRK_INS_BYTE0  KGDB_DYN_DGB_BRK_BYTE(0)
+#define  KGDB_DYN_BRK_INS_BYTE1  KGDB_DYN_DGB_BRK_BYTE(1)
+#define  KGDB_DYN_BRK_INS_BYTE2  KGDB_DYN_DGB_BRK_BYTE(2)
+#define  KGDB_DYN_BRK_INS_BYTE3  KGDB_DYN_DGB_BRK_BYTE(3)
+
+#define CACHE_FLUSH_IS_SAFE		1
+
 enum debug_el {
 	DBG_ACTIVE_EL0 = 0,
 	DBG_ACTIVE_EL1,
@@ -43,23 +90,6 @@ enum debug_el {
 #ifndef __ASSEMBLY__
 struct task_struct;

-#define local_dbg_save(flags)							\
-	do {									\
-		typecheck(unsigned long, flags);				\
-		asm volatile(							\
-		"mrs	%0, daif			// local_dbg_save\n"	\
-		"msr	daifset, #8"						\
-		: "=r" (flags) : : "memory");					\
-	} while (0)
-
-#define local_dbg_restore(flags)						\
-	do {									\
-		typecheck(unsigned long, flags);				\
-		asm volatile(							\
-		"msr	daif, %0			// local_dbg_restore\n"	\
-		: : "r" (flags) : "memory");					\
-	} while (0)
-
 #define DBG_ARCH_ID_RESERVED	0	/* In case of ptrace ABI updates. */

 #define DBG_HOOK_HANDLED	0

--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -30,6 +30,8 @@

 #define DMA_ERROR_CODE	(~(dma_addr_t)0)
 extern struct dma_map_ops *dma_ops;
+extern struct dma_map_ops coherent_swiotlb_dma_ops;
+extern struct dma_map_ops noncoherent_swiotlb_dma_ops;

 static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
@@ -47,6 +49,11 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 		return __generic_dma_ops(dev);
 }

+static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops)
+{
+	dev->archdata.dma_ops = ops;
+}
+
 #include <asm-generic/dma-mapping-common.h>

 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)

--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -32,6 +32,12 @@
 #define COMPAT_HWCAP_IDIV	(COMPAT_HWCAP_IDIVA|COMPAT_HWCAP_IDIVT)
 #define COMPAT_HWCAP_EVTSTRM	(1 << 21)

+#define COMPAT_HWCAP2_AES	(1 << 0)
+#define COMPAT_HWCAP2_PMULL	(1 << 1)
+#define COMPAT_HWCAP2_SHA1	(1 << 2)
+#define COMPAT_HWCAP2_SHA2	(1 << 3)
+#define COMPAT_HWCAP2_CRC32	(1 << 4)
+
 #ifndef __ASSEMBLY__
 /*
 * This yields a mask that user programs can use to figure out what
@@ -41,7 +47,8 @@

 #ifdef CONFIG_COMPAT
 #define COMPAT_ELF_HWCAP	(compat_elf_hwcap)
-extern unsigned int compat_elf_hwcap;
+#define COMPAT_ELF_HWCAP2	(compat_elf_hwcap2)
+extern unsigned int compat_elf_hwcap, compat_elf_hwcap2;
 #endif

 extern unsigned long elf_hwcap;

--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -121,7 +121,7 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
 *  I/O port access primitives.
 */
 #define IO_SPACE_LIMIT		0xffff
-#define PCI_IOBASE		((void __iomem *)(MODULES_VADDR - SZ_2M))
+#define PCI_IOBASE		((void __iomem *)(MODULES_VADDR - SZ_32M))

 static inline u8 inb(unsigned long addr)
 {

--- a/arch/arm64/include/asm/irqflags.h
+++ b/arch/arm64/include/asm/irqflags.h
@@ -90,5 +90,28 @@ static inline int arch_irqs_disabled_flags(unsigned long flags)
 	return flags & PSR_I_BIT;
 }

+/*
+ * save and restore debug state
+ */
+#define local_dbg_save(flags)						\
+	do {								\
+		typecheck(unsigned long, flags);			\
+		asm volatile(						\
+		"mrs    %0, daif		// local_dbg_save\n"	\
+		"msr    daifset, #8"					\
+		: "=r" (flags) : : "memory");				\
+	} while (0)
+
+#define local_dbg_restore(flags)					\
+	do {								\
+		typecheck(unsigned long, flags);			\
+		asm volatile(						\
+		"msr    daif, %0		// local_dbg_restore\n"	\
+		: : "r" (flags) : "memory");				\
+	} while (0)
+
+#define local_dbg_enable()	asm("msr	daifclr, #8" : : : "memory")
+#define local_dbg_disable()	asm("msr	daifset, #8" : : : "memory")
+
 #endif
 #endif
--- a/arch/arm64/include/asm/kgdb.h
+++ b/arch/arm64/include/asm/kgdb.h
+/*
+ * AArch64 KGDB support
+ *
+ * Based on arch/arm/include/kgdb.h
+ *
+ * Copyright (C) 2013 Cavium Inc.
+ * Author: Vijaya Kumar K <vijaya.kumar@caviumnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM_KGDB_H
+#define __ARM_KGDB_H
+
+#include <linux/ptrace.h>
+#include <asm/debug-monitors.h>
+
+#ifndef	__ASSEMBLY__
+
+static inline void arch_kgdb_breakpoint(void)
+{
+	asm ("brk %0" : : "I" (KDBG_COMPILED_DBG_BRK_IMM));
+}
+
+extern void kgdb_handle_bus_error(void);
+extern int kgdb_fault_expected;
+
+#endif /* !__ASSEMBLY__ */
+
+/*
+ * gdb is expecting the following registers layout.
+ *
+ * General purpose regs:
+ *     r0-r30: 64 bit
+ *     sp,pc : 64 bit
+ *     pstate  : 64 bit
+ *     Total: 34
+ * FPU regs:
+ *     f0-f31: 128 bit
+ *     Total: 32
+ * Extra regs
+ *     fpsr & fpcr: 32 bit
+ *     Total: 2
+ *
+ */
+
+#define _GP_REGS		34
+#define _FP_REGS		32
+#define _EXTRA_REGS		2
+/*
+ * general purpose registers size in bytes.
+ * pstate is only 4 bytes. subtract 4 bytes
+ */
+#define GP_REG_BYTES		(_GP_REGS * 8)
+#define DBG_MAX_REG_NUM		(_GP_REGS + _FP_REGS + _EXTRA_REGS)
+
+/*
+ * Size of I/O buffer for gdb packet.
+ * considering to hold all register contents, size is set
+ */
+
+#define BUFMAX			2048
+
+/*
+ * Number of bytes required for gdb_regs buffer.
+ * _GP_REGS: 8 bytes, _FP_REGS: 16 bytes and _EXTRA_REGS: 4 bytes each
+ * GDB fails to connect for size beyond this with error
+ * "'g' packet reply is too long"
+ */
+
+#define NUMREGBYTES	((_GP_REGS * 8) + (_FP_REGS * 16) + \
+			(_EXTRA_REGS * 4))
+
+#endif /* __ASM_KGDB_H */
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -106,7 +106,6 @@

 /* VTCR_EL2 Registers bits */
 #define VTCR_EL2_PS_MASK	(7 << 16)
-#define VTCR_EL2_PS_40B		(2 << 16)
 #define VTCR_EL2_TG0_MASK	(1 << 14)
 #define VTCR_EL2_TG0_4K		(0 << 14)
 #define VTCR_EL2_TG0_64K	(1 << 14)
@@ -129,10 +128,9 @@
 * 64kB pages (TG0 = 1)
 * 2 level page tables (SL = 1)
 */
-#define VTCR_EL2_FLAGS		(VTCR_EL2_PS_40B | VTCR_EL2_TG0_64K | \
-				 VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
-				 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_SL0_LVL1 | \
-				 VTCR_EL2_T0SZ_40B)
+#define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_64K | VTCR_EL2_SH0_INNER | \
+				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
+				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
 #define VTTBR_X		(38 - VTCR_EL2_T0SZ_40B)
 #else
 /*
@@ -142,10 +140,9 @@
 * 4kB pages (TG0 = 0)
 * 3 level page tables (SL = 1)
 */
-#define VTCR_EL2_FLAGS		(VTCR_EL2_PS_40B | VTCR_EL2_TG0_4K | \
-				 VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
-				 VTCR_EL2_IRGN0_WBWA | VTCR_EL2_SL0_LVL1 | \
-				 VTCR_EL2_T0SZ_40B)
+#define VTCR_EL2_FLAGS		(VTCR_EL2_TG0_4K | VTCR_EL2_SH0_INNER | \
+				 VTCR_EL2_ORGN0_WBWA | VTCR_EL2_IRGN0_WBWA | \
+				 VTCR_EL2_SL0_LVL1 | VTCR_EL2_T0SZ_40B)
 #define VTTBR_X		(37 - VTCR_EL2_T0SZ_40B)
 #endif


--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -100,9 +100,9 @@
 #define PTE_HYP			PTE_USER

 /*
- * 40-bit physical address supported.
+ * Highest possible physical address supported.
 */
-#define PHYS_MASK_SHIFT		(40)
+#define PHYS_MASK_SHIFT		(48)
 #define PHYS_MASK		((UL(1) << PHYS_MASK_SHIFT) - 1)

 /*
@@ -122,7 +122,6 @@
 #define TCR_SHARED		((UL(3) << 12) | (UL(3) << 28))
 #define TCR_TG0_64K		(UL(1) << 14)
 #define TCR_TG1_64K		(UL(1) << 30)
-#define TCR_IPS_40BIT		(UL(2) << 32)
 #define TCR_ASID16		(UL(1) << 36)
 #define TCR_TBI0		(UL(1) << 37)


--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -199,7 +199,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pte)
 {
 	if (pte_valid_user(pte)) {
-		if (pte_exec(pte))
+		if (!pte_special(pte) && pte_exec(pte))
 			__sync_icache_dcache(pte, addr);
 		if (pte_dirty(pte) && pte_write(pte))
 			pte_val(pte) &= ~PTE_RDONLY;
@@ -227,36 +227,36 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,

 #define __HAVE_ARCH_PTE_SPECIAL

-/*
- * Software PMD bits for THP
- */
+static inline pte_t pmd_pte(pmd_t pmd)
+{
+	return __pte(pmd_val(pmd));
+}

-#define PMD_SECT_DIRTY		(_AT(pmdval_t, 1) << 55)
-#define PMD_SECT_SPLITTING	(_AT(pmdval_t, 1) << 57)
+static inline pmd_t pte_pmd(pte_t pte)
+{
+	return __pmd(pte_val(pte));
+}

 /*
 * THP definitions.
 */
-#define pmd_young(pmd)		(pmd_val(pmd) & PMD_SECT_AF)
-
-#define __HAVE_ARCH_PMD_WRITE
-#define pmd_write(pmd)		(!(pmd_val(pmd) & PMD_SECT_RDONLY))

 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
+#define pmd_trans_splitting(pmd)	pte_special(pmd_pte(pmd))
 #endif

-#define PMD_BIT_FUNC(fn,op) \
-static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
+#define pmd_young(pmd)		pte_young(pmd_pte(pmd))
+#define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
+#define pmd_mksplitting(pmd)	pte_pmd(pte_mkspecial(pmd_pte(pmd)))
+#define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
+#define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
+#define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
+#define pmd_mknotpresent(pmd)	(__pmd(pmd_val(pmd) &= ~PMD_TYPE_MASK))

-PMD_BIT_FUNC(wrprotect,	|= PMD_SECT_RDONLY);
-PMD_BIT_FUNC(mkold,	&= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= PMD_SECT_SPLITTING);
-PMD_BIT_FUNC(mkwrite,   &= ~PMD_SECT_RDONLY);
-PMD_BIT_FUNC(mkdirty,   |= PMD_SECT_DIRTY);
-PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
-PMD_BIT_FUNC(mknotpresent, &= ~PMD_TYPE_MASK);
+#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write(pmd)		pte_write(pmd_pte(pmd))

 #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))

@@ -266,15 +266,6 @@ PMD_BIT_FUNC(mknotpresent, &= ~PMD_TYPE_MASK);

 #define pmd_page(pmd)           pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))

-static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
-{
-	const pmdval_t mask = PMD_SECT_USER | PMD_SECT_PXN | PMD_SECT_UXN |
-			      PMD_SECT_RDONLY | PMD_SECT_PROT_NONE |
-			      PMD_SECT_VALID;
-	pmd_val(pmd) = (pmd_val(pmd) & ~mask) | (pgprot_val(newprot) & mask);
-	return pmd;
-}
-
 #define set_pmd_at(mm, addr, pmdp, pmd)	set_pmd(pmdp, pmd)

 static inline int has_transparent_hugepage(void)
@@ -286,11 +277,9 @@ static inline int has_transparent_hugepage(void)
 * Mark the prot value as uncacheable and unbufferable.
 */
 #define pgprot_noncached(prot) \
-	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE))
+	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN)
 #define pgprot_writecombine(prot) \
-	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC))
-#define pgprot_dmacoherent(prot) \
-	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC))
+	__pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
 #define __HAVE_PHYS_MEM_ACCESS_PROT
 struct file;
 extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
@@ -383,6 +372,11 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 	return pte;
 }

+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+	return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
+}
+
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 extern pgd_t idmap_pg_dir[PTRS_PER_PGD];


--- a/arch/arm64/include/asm/psci.h
+++ b/arch/arm64/include/asm/psci.h
@@ -14,6 +14,6 @@
 #ifndef __ASM_PSCI_H
 #define __ASM_PSCI_H

-int psci_init(void);
+void psci_init(void);

 #endif /* __ASM_PSCI_H */
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -68,6 +68,7 @@

 /* Architecturally defined mapping between AArch32 and AArch64 registers */
 #define compat_usr(x)	regs[(x)]
+#define compat_fp	regs[11]
 #define compat_sp	regs[13]
 #define compat_lr	regs[14]
 #define compat_sp_hyp	regs[15]
@@ -132,7 +133,7 @@ struct pt_regs {
 	(!((regs)->pstate & PSR_F_BIT))

 #define user_stack_pointer(regs) \
-	((regs)->sp)
+	(!compat_user_mode(regs)) ? ((regs)->sp) : ((regs)->compat_sp)

 /*
 * Are the current registers suitable for user mode? (used to maintain
@@ -164,7 +165,7 @@ static inline int valid_user_regs(struct user_pt_regs *regs)
 	return 0;
 }

-#define instruction_pointer(regs)	(regs)->pc
+#define instruction_pointer(regs)	((unsigned long)(regs)->pc)

 #ifdef CONFIG_SMP
 extern unsigned long profile_pc(struct pt_regs *regs);

--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -19,115 +19,44 @@
 #ifndef __ASM_TLB_H
 #define __ASM_TLB_H

-#include <linux/pagemap.h>
-#include <linux/swap.h>

-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-
-#define MMU_GATHER_BUNDLE	8
-
-/*
- * TLB handling.  This allows us to remove pages from the page
- * tables, and efficiently handle the TLB issues.
- */
-struct mmu_gather {
-	struct mm_struct	*mm;
-	unsigned int		fullmm;
-	struct vm_area_struct	*vma;
-	unsigned long		start, end;
-	unsigned long		range_start;
-	unsigned long		range_end;
-	unsigned int		nr;
-	unsigned int		max;
-	struct page		**pages;
-	struct page		*local[MMU_GATHER_BUNDLE];
-};
+#include <asm-generic/tlb.h>

 /*
- * This is unnecessarily complex.  There's three ways the TLB shootdown
- * code is used:
+ * There's three ways the TLB shootdown code is used:
 *  1. Unmapping a range of vmas.  See zap_page_range(), unmap_region().
 *     tlb->fullmm = 0, and tlb_start_vma/tlb_end_vma will be called.
- *     tlb->vma will be non-NULL.
 *  2. Unmapping all vmas.  See exit_mmap().
 *     tlb->fullmm = 1, and tlb_start_vma/tlb_end_vma will be called.
- *     tlb->vma will be non-NULL.  Additionally, page tables will be freed.
+ *     Page tables will be freed.
 *  3. Unmapping argument pages.  See shift_arg_pages().
 *     tlb->fullmm = 0, but tlb_start_vma/tlb_end_vma will not be called.
- *     tlb->vma will be NULL.
 */
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
-	if (tlb->fullmm || !tlb->vma)
+	if (tlb->fullmm) {
 		flush_tlb_mm(tlb->mm);
-	else if (tlb->range_end > 0) {
-		flush_tlb_range(tlb->vma, tlb->range_start, tlb->range_end);
-		tlb->range_start = TASK_SIZE;
-		tlb->range_end = 0;
+	} else if (tlb->end > 0) {
+		struct vm_area_struct vma = { .vm_mm = tlb->mm, };
+		flush_tlb_range(&vma, tlb->start, tlb->end);
+		tlb->start = TASK_SIZE;
+		tlb->end = 0;
 	}
 }

 static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr)
 {
 	if (!tlb->fullmm) {
-		if (addr < tlb->range_start)
-			tlb->range_start = addr;
-		if (addr + PAGE_SIZE > tlb->range_end)
-			tlb->range_end = addr + PAGE_SIZE;
-	}
-}
-
-static inline void __tlb_alloc_page(struct mmu_gather *tlb)
-{
-	unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
-
-	if (addr) {
-		tlb->pages = (void *)addr;
-		tlb->max = PAGE_SIZE / sizeof(struct page *);
+		tlb->start = min(tlb->start, addr);
+		tlb->end = max(tlb->end, addr + PAGE_SIZE);
 	}
 }

-static inline void tlb_flush_mmu(struct mmu_gather *tlb)
-{
-	tlb_flush(tlb);
-	free_pages_and_swap_cache(tlb->pages, tlb->nr);
-	tlb->nr = 0;
-	if (tlb->pages == tlb->local)
-		__tlb_alloc_page(tlb);
-}
-
-static inline void
-tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end)
-{
-	tlb->mm = mm;
-	tlb->fullmm = !(start | (end+1));
-	tlb->start = start;
-	tlb->end = end;
-	tlb->vma = NULL;
-	tlb->max = ARRAY_SIZE(tlb->local);
-	tlb->pages = tlb->local;
-	tlb->nr = 0;
-	__tlb_alloc_page(tlb);
-}
-
-static inline void
-tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
-{
-	tlb_flush_mmu(tlb);
-
-	/* keep the page table cache within bounds */
-	check_pgt_cache();
-
-	if (tlb->pages != tlb->local)
-		free_pages((unsigned long)tlb->pages, 0);
-}
-
 /*
 * Memorize the range for the TLB flush.
 */
-static inline void
-tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
+static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
+					  unsigned long addr)
 {
 	tlb_add_flush(tlb, addr);
 }
@@ -137,38 +66,24 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
 * case where we're doing a full MM flush.  When we're doing a munmap,
 * the vmas are adjusted to only cover the region to be torn down.
 */
-static inline void
-tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+static inline void tlb_start_vma(struct mmu_gather *tlb,
+				 struct vm_area_struct *vma)
 {
 	if (!tlb->fullmm) {
-		tlb->vma = vma;
-		tlb->range_start = TASK_SIZE;
-		tlb->range_end = 0;
+		tlb->start = TASK_SIZE;
+		tlb->end = 0;
 	}
 }

-static inline void
-tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
+static inline void tlb_end_vma(struct mmu_gather *tlb,
+			       struct vm_area_struct *vma)
 {
 	if (!tlb->fullmm)
 		tlb_flush(tlb);
 }

-static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
-	tlb->pages[tlb->nr++] = page;
-	VM_BUG_ON(tlb->nr > tlb->max);
-	return tlb->max - tlb->nr;
-}
-
-static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
-{
-	if (!__tlb_remove_page(tlb, page))
-		tlb_flush_mmu(tlb);
-}
-
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
-	unsigned long addr)
+				  unsigned long addr)
 {
 	pgtable_page_dtor(pte);
 	tlb_add_flush(tlb, addr);
@@ -184,16 +99,5 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 }
 #endif

-#define pte_free_tlb(tlb, ptep, addr)	__pte_free_tlb(tlb, ptep, addr)
-#define pmd_free_tlb(tlb, pmdp, addr)	__pmd_free_tlb(tlb, pmdp, addr)
-#define pud_free_tlb(tlb, pudp, addr)	pud_free((tlb)->mm, pudp)
-
-#define tlb_migrate_finish(mm)		do { } while (0)
-
-static inline void
-tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr)
-{
-	tlb_add_flush(tlb, addr);
-}

 #endif
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
+#ifndef __ASM_TOPOLOGY_H
+#define __ASM_TOPOLOGY_H
+
+#ifdef CONFIG_SMP
+
+#include <linux/cpumask.h>
+
+struct cpu_topology {
+	int thread_id;
+	int core_id;
+	int cluster_id;
+	cpumask_t thread_sibling;
+	cpumask_t core_sibling;
+};
+
+extern struct cpu_topology cpu_topology[NR_CPUS];
+
+#define topology_physical_package_id(cpu)	(cpu_topology[cpu].cluster_id)
+#define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
+#define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
+#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+
+#define mc_capable()	(cpu_topology[0].cluster_id != -1)
+#define smt_capable()	(cpu_topology[0].thread_id != -1)
+
+void init_cpu_topology(void);
+void store_cpu_topology(unsigned int cpuid);
+const struct cpumask *cpu_coregroup_mask(int cpu);
+
+#else
+
+static inline void init_cpu_topology(void) { }
+static inline void store_cpu_topology(unsigned int cpuid) { }
+
+#endif
+
+#include <asm-generic/topology.h>
+
+#endif /* _ASM_ARM_TOPOLOGY_H */
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -83,7 +83,7 @@ static inline void set_fs(mm_segment_t fs)
 * Returns 1 if the range is valid, 0 otherwise.
 *
 * This is equivalent to the following test:
- * (u65)addr + (u65)size < (u65)current->addr_limit
+ * (u65)addr + (u65)size <= current->addr_limit
 *
 * This needs 65-bit arithmetic.
 */
@@ -91,7 +91,7 @@ static inline void set_fs(mm_segment_t fs)
 ({									\
 	unsigned long flag, roksum;					\
 	__chk_user_ptr(addr);						\
-	asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, cc"		\
+	asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls"		\
 		: "=&r" (flag), "=&r" (roksum)				\
 		: "1" (addr), "Ir" (size),				\
 		  "r" (current_thread_info()->addr_limit)		\

--- a/arch/arm64/include/uapi/asm/Kbuild
+++ b/arch/arm64/include/uapi/asm/Kbuild
@@ -9,6 +9,7 @@ header-y += byteorder.h
 header-y += fcntl.h
 header-y += hwcap.h
 header-y += kvm_para.h
+header-y += perf_regs.h
 header-y += param.h
 header-y += ptrace.h
 header-y += setup.h

--- a/arch/arm64/include/uapi/asm/perf_regs.h
+++ b/arch/arm64/include/uapi/asm/perf_regs.h
+#ifndef _ASM_ARM64_PERF_REGS_H
+#define _ASM_ARM64_PERF_REGS_H
+
+enum perf_event_arm_regs {
+	PERF_REG_ARM64_X0,
+	PERF_REG_ARM64_X1,
+	PERF_REG_ARM64_X2,
+	PERF_REG_ARM64_X3,
+	PERF_REG_ARM64_X4,
+	PERF_REG_ARM64_X5,
+	PERF_REG_ARM64_X6,
+	PERF_REG_ARM64_X7,
+	PERF_REG_ARM64_X8,
+	PERF_REG_ARM64_X9,
+	PERF_REG_ARM64_X10,
+	PERF_REG_ARM64_X11,
+	PERF_REG_ARM64_X12,
+	PERF_REG_ARM64_X13,
+	PERF_REG_ARM64_X14,
+	PERF_REG_ARM64_X15,
+	PERF_REG_ARM64_X16,
+	PERF_REG_ARM64_X17,
+	PERF_REG_ARM64_X18,
+	PERF_REG_ARM64_X19,
+	PERF_REG_ARM64_X20,
+	PERF_REG_ARM64_X21,
+	PERF_REG_ARM64_X22,
+	PERF_REG_ARM64_X23,
+	PERF_REG_ARM64_X24,
+	PERF_REG_ARM64_X25,
+	PERF_REG_ARM64_X26,
+	PERF_REG_ARM64_X27,
+	PERF_REG_ARM64_X28,
+	PERF_REG_ARM64_X29,
+	PERF_REG_ARM64_LR,
+	PERF_REG_ARM64_SP,
+	PERF_REG_ARM64_PC,
+	PERF_REG_ARM64_MAX,
+};
+#endif /* _ASM_ARM64_PERF_REGS_H */
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,12 +14,14 @@ arm64-obj-y		:= cputable.o debug-monitors.o entry.o irq.o fpsimd.o	\
 arm64-obj-$(CONFIG_COMPAT)		+= sys32.o kuser32.o signal32.o 	\
 					   sys_compat.o
 arm64-obj-$(CONFIG_MODULES)		+= arm64ksyms.o module.o
-arm64-obj-$(CONFIG_SMP)			+= smp.o smp_spin_table.o
+arm64-obj-$(CONFIG_SMP)			+= smp.o smp_spin_table.o topology.o
+arm64-obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 arm64-obj-$(CONFIG_HW_PERF_EVENTS)	+= perf_event.o
-arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)+= hw_breakpoint.o
+arm64-obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 arm64-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 arm64-obj-$(CONFIG_ARM64_CPU_SUSPEND)	+= sleep.o suspend.o
 arm64-obj-$(CONFIG_JUMP_LABEL)		+= jump_label.o
+arm64-obj-$(CONFIG_KGDB)		+= kgdb.o

 obj-y					+= $(arm64-obj-y) vdso/
 obj-m					+= $(arm64-obj-m)

--- a/arch/arm64/kernel/debug-monitors.c
+++ b/arch/arm64/kernel/debug-monitors.c
@@ -137,7 +137,6 @@ void disable_debug_monitors(enum debug_el el)
 static void clear_os_lock(void *unused)
 {
 	asm volatile("msr oslar_el1, %0" : : "r" (0));
-	isb();
 }

 static int os_lock_notify(struct notifier_block *self,
@@ -156,8 +155,9 @@ static struct notifier_block os_lock_nb = {
 static int debug_monitors_init(void)
 {
 	/* Clear the OS lock. */
-	smp_call_function(clear_os_lock, NULL, 1);
-	clear_os_lock(NULL);
+	on_each_cpu(clear_os_lock, NULL, 1);
+	isb();
+	local_dbg_enable();

 	/* Register hotplug handler. */
 	register_cpu_notifier(&os_lock_nb);
@@ -189,7 +189,7 @@ static void clear_regs_spsr_ss(struct pt_regs *regs)

 /* EL1 Single Step Handler hooks */
 static LIST_HEAD(step_hook);
-DEFINE_RWLOCK(step_hook_lock);
+static DEFINE_RWLOCK(step_hook_lock);

 void register_step_hook(struct step_hook *hook)
 {
@@ -276,7 +276,7 @@ static int single_step_handler(unsigned long addr, unsigned int esr,
 * Use reader/writer locks instead of plain spinlock.
 */
 static LIST_HEAD(break_hook);
-DEFINE_RWLOCK(break_hook_lock);
+static DEFINE_RWLOCK(break_hook_lock);

 void register_break_hook(struct break_hook *hook)
 {

--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -384,26 +384,18 @@ ENDPROC(__calc_phys_offset)
 * Preserves:	tbl, flags
 * Corrupts:	phys, start, end, pstate
 */
-	.macro	create_block_map, tbl, flags, phys, start, end, idmap=0
+	.macro	create_block_map, tbl, flags, phys, start, end
 	lsr	\phys, \phys, #BLOCK_SHIFT
-	.if	\idmap
-	and	\start, \phys, #PTRS_PER_PTE - 1	// table index
-	.else
 	lsr	\start, \start, #BLOCK_SHIFT
 	and	\start, \start, #PTRS_PER_PTE - 1	// table index
-	.endif
 	orr	\phys, \flags, \phys, lsl #BLOCK_SHIFT	// table entry
-	.ifnc	\start,\end
 	lsr	\end, \end, #BLOCK_SHIFT
 	and	\end, \end, #PTRS_PER_PTE - 1		// table end index
-	.endif
 9999:	str	\phys, [\tbl, \start, lsl #3]		// store the entry
-	.ifnc	\start,\end
 	add	\start, \start, #1			// next entry
 	add	\phys, \phys, #BLOCK_SIZE		// next block
 	cmp	\start, \end
 	b.ls	9999b
-	.endif
 	.endm

 /*
@@ -435,9 +427,13 @@ __create_page_tables:
 	 * Create the identity mapping.
 	 */
 	add	x0, x25, #PAGE_SIZE		// section table address
-	adr	x3, __turn_mmu_on		// virtual/physical address
+	ldr	x3, =KERNEL_START
+	add	x3, x3, x28			// __pa(KERNEL_START)
 	create_pgd_entry x25, x0, x3, x5, x6
-	create_block_map x0, x7, x3, x5, x5, idmap=1
+	ldr	x6, =KERNEL_END
+	mov	x5, x3				// __pa(KERNEL_START)
+	add	x6, x6, x28			// __pa(KERNEL_END)
+	create_block_map x0, x7, x3, x5, x6

 	/*
 	 * Map the kernel image (starting with PHYS_OFFSET).
@@ -445,7 +441,7 @@ __create_page_tables:
 	add	x0, x26, #PAGE_SIZE		// section table address
 	mov	x5, #PAGE_OFFSET
 	create_pgd_entry x26, x0, x5, x3, x6
-	ldr	x6, =KERNEL_END - 1
+	ldr	x6, =KERNEL_END
 	mov	x3, x24				// phys offset
 	create_block_map x0, x7, x3, x5, x6


--- a/arch/arm64/kernel/kgdb.c
+++ b/arch/arm64/kernel/kgdb.c
+/*
+ * AArch64 KGDB support
+ *
+ * Based on arch/arm/kernel/kgdb.c
+ *
+ * Copyright (C) 2013 Cavium Inc.
+ * Author: Vijaya Kumar K <vijaya.kumar@caviumnetworks.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/irq.h>
+#include <linux/kdebug.h>
+#include <linux/kgdb.h>
+#include <asm/traps.h>
+
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = {
+	{ "x0", 8, offsetof(struct pt_regs, regs[0])},
+	{ "x1", 8, offsetof(struct pt_regs, regs[1])},
+	{ "x2", 8, offsetof(struct pt_regs, regs[2])},
+	{ "x3", 8, offsetof(struct pt_regs, regs[3])},
+	{ "x4", 8, offsetof(struct pt_regs, regs[4])},
+	{ "x5", 8, offsetof(struct pt_regs, regs[5])},
+	{ "x6", 8, offsetof(struct pt_regs, regs[6])},
+	{ "x7", 8, offsetof(struct pt_regs, regs[7])},
+	{ "x8", 8, offsetof(struct pt_regs, regs[8])},
+	{ "x9", 8, offsetof(struct pt_regs, regs[9])},
+	{ "x10", 8, offsetof(struct pt_regs, regs[10])},
+	{ "x11", 8, offsetof(struct pt_regs, regs[11])},
+	{ "x12", 8, offsetof(struct pt_regs, regs[12])},
+	{ "x13", 8, offsetof(struct pt_regs, regs[13])},
+	{ "x14", 8, offsetof(struct pt_regs, regs[14])},
+	{ "x15", 8, offsetof(struct pt_regs, regs[15])},
+	{ "x16", 8, offsetof(struct pt_regs, regs[16])},
+	{ "x17", 8, offsetof(struct pt_regs, regs[17])},
+	{ "x18", 8, offsetof(struct pt_regs, regs[18])},
+	{ "x19", 8, offsetof(struct pt_regs, regs[19])},
+	{ "x20", 8, offsetof(struct pt_regs, regs[20])},
+	{ "x21", 8, offsetof(struct pt_regs, regs[21])},
+	{ "x22", 8, offsetof(struct pt_regs, regs[22])},
+	{ "x23", 8, offsetof(struct pt_regs, regs[23])},
+	{ "x24", 8, offsetof(struct pt_regs, regs[24])},
+	{ "x25", 8, offsetof(struct pt_regs, regs[25])},
+	{ "x26", 8, offsetof(struct pt_regs, regs[26])},
+	{ "x27", 8, offsetof(struct pt_regs, regs[27])},
+	{ "x28", 8, offsetof(struct pt_regs, regs[28])},
+	{ "x29", 8, offsetof(struct pt_regs, regs[29])},
+	{ "x30", 8, offsetof(struct pt_regs, regs[30])},
+	{ "sp", 8, offsetof(struct pt_regs, sp)},
+	{ "pc", 8, offsetof(struct pt_regs, pc)},
+	{ "pstate", 8, offsetof(struct pt_regs, pstate)},
+	{ "v0", 16, -1 },
+	{ "v1", 16, -1 },
+	{ "v2", 16, -1 },
+	{ "v3", 16, -1 },
+	{ "v4", 16, -1 },
+	{ "v5", 16, -1 },
+	{ "v6", 16, -1 },
+	{ "v7", 16, -1 },
+	{ "v8", 16, -1 },
+	{ "v9", 16, -1 },
+	{ "v10", 16, -1 },
+	{ "v11", 16, -1 },
+	{ "v12", 16, -1 },
+	{ "v13", 16, -1 },
+	{ "v14", 16, -1 },
+	{ "v15", 16, -1 },
+	{ "v16", 16, -1 },
+	{ "v17", 16, -1 },
+	{ "v18", 16, -1 },
+	{ "v19", 16, -1 },
+	{ "v20", 16, -1 },
+	{ "v21", 16, -1 },
+	{ "v22", 16, -1 },
+	{ "v23", 16, -1 },
+	{ "v24", 16, -1 },
+	{ "v25", 16, -1 },
+	{ "v26", 16, -1 },
+	{ "v27", 16, -1 },
+	{ "v28", 16, -1 },
+	{ "v29", 16, -1 },
+	{ "v30", 16, -1 },
+	{ "v31", 16, -1 },
+	{ "fpsr", 4, -1 },
+	{ "fpcr", 4, -1 },
+};
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return NULL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+		       dbg_reg_def[regno].size);
+	else
+		memset(mem, 0, dbg_reg_def[regno].size);
+	return dbg_reg_def[regno].name;
+}
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+	if (regno >= DBG_MAX_REG_NUM || regno < 0)
+		return -EINVAL;
+
+	if (dbg_reg_def[regno].offset != -1)
+		memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+		       dbg_reg_def[regno].size);
+	return 0;
+}
+
+void
+sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *task)
+{
+	struct pt_regs *thread_regs;
+
+	/* Initialize to zero */
+	memset((char *)gdb_regs, 0, NUMREGBYTES);
+	thread_regs = task_pt_regs(task);
+	memcpy((void *)gdb_regs, (void *)thread_regs->regs, GP_REG_BYTES);
+}
+
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long pc)
+{
+	regs->pc = pc;
+}
+
+static int compiled_break;
+
+static void kgdb_arch_update_addr(struct pt_regs *regs,
+				char *remcom_in_buffer)
+{
+	unsigned long addr;
+	char *ptr;
+
+	ptr = &remcom_in_buffer[1];
+	if (kgdb_hex2long(&ptr, &addr))
+		kgdb_arch_set_pc(regs, addr);
+	else if (compiled_break == 1)
+		kgdb_arch_set_pc(regs, regs->pc + 4);
+
+	compiled_break = 0;
+}
+
+int kgdb_arch_handle_exception(int exception_vector, int signo,
+			       int err_code, char *remcom_in_buffer,
+			       char *remcom_out_buffer,
+			       struct pt_regs *linux_regs)
+{
+	int err;
+
+	switch (remcom_in_buffer[0]) {
+	case 'D':
+	case 'k':
+		/*
+		 * Packet D (Detach), k (kill). No special handling
+		 * is required here. Handle same as c packet.
+		 */
+	case 'c':
+		/*
+		 * Packet c (Continue) to continue executing.
+		 * Set pc to required address.
+		 * Try to read optional parameter and set pc.
+		 * If this was a compiled breakpoint, we need to move
+		 * to the next instruction else we will just breakpoint
+		 * over and over again.
+		 */
+		kgdb_arch_update_addr(linux_regs, remcom_in_buffer);
+		atomic_set(&kgdb_cpu_doing_single_step, -1);
+		kgdb_single_step =  0;
+
+		/*
+		 * Received continue command, disable single step
+		 */
+		if (kernel_active_single_step())
+			kernel_disable_single_step();
+
+		err = 0;
+		break;
+	case 's':
+		/*
+		 * Update step address value with address passed
+		 * with step packet.
+		 * On debug exception return PC is copied to ELR
+		 * So just update PC.
+		 * If no step address is passed, resume from the address
+		 * pointed by PC. Do not update PC
+		 */
+		kgdb_arch_update_addr(linux_regs, remcom_in_buffer);
+		atomic_set(&kgdb_cpu_doing_single_step, raw_smp_processor_id());
+		kgdb_single_step =  1;
+
+		/*
+		 * Enable single step handling
+		 */
+		if (!kernel_active_single_step())
+			kernel_enable_single_step(linux_regs);
+		err = 0;
+		break;
+	default:
+		err = -1;
+	}
+	return err;
+}
+
+static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr)
+{
+	kgdb_handle_exception(1, SIGTRAP, 0, regs);
+	return 0;
+}
+
+static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr)
+{
+	compiled_break = 1;
+	kgdb_handle_exception(1, SIGTRAP, 0, regs);
+
+	return 0;
+}
+
+static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr)
+{
+	kgdb_handle_exception(1, SIGTRAP, 0, regs);
+	return 0;
+}
+
+static struct break_hook kgdb_brkpt_hook = {
+	.esr_mask	= 0xffffffff,
+	.esr_val	= DBG_ESR_VAL_BRK(KGDB_DYN_DGB_BRK_IMM),
+	.fn		= kgdb_brk_fn
+};
+
+static struct break_hook kgdb_compiled_brkpt_hook = {
+	.esr_mask	= 0xffffffff,
+	.esr_val	= DBG_ESR_VAL_BRK(KDBG_COMPILED_DBG_BRK_IMM),
+	.fn		= kgdb_compiled_brk_fn
+};
+
+static struct step_hook kgdb_step_hook = {
+	.fn		= kgdb_step_brk_fn
+};
+
+static void kgdb_call_nmi_hook(void *ignored)
+{
+	kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs());
+}
+
+void kgdb_roundup_cpus(unsigned long flags)
+{
+	local_irq_enable();
+	smp_call_function(kgdb_call_nmi_hook, NULL, 0);
+	local_irq_disable();
+}
+
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+	struct pt_regs *regs = args->regs;
+
+	if (kgdb_handle_exception(1, args->signr, cmd, regs))
+		return NOTIFY_DONE;
+	return NOTIFY_STOP;
+}
+
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+	ret = __kgdb_notify(ptr, cmd);
+	local_irq_restore(flags);
+
+	return ret;
+}
+
+static struct notifier_block kgdb_notifier = {
+	.notifier_call	= kgdb_notify,
+	/*
+	 * Want to be lowest priority
+	 */
+	.priority	= -INT_MAX,
+};
+
+/*
+ * kgdb_arch_init - Perform any architecture specific initalization.
+ * This function will handle the initalization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+	int ret = register_die_notifier(&kgdb_notifier);
+
+	if (ret != 0)
+		return ret;
+
+	register_break_hook(&kgdb_brkpt_hook);
+	register_break_hook(&kgdb_compiled_brkpt_hook);
+	register_step_hook(&kgdb_step_hook);
+	return 0;
+}
+
+/*
+ * kgdb_arch_exit - Perform any architecture specific uninitalization.
+ * This function will handle the uninitalization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+	unregister_break_hook(&kgdb_brkpt_hook);
+	unregister_break_hook(&kgdb_compiled_brkpt_hook);
+	unregister_step_hook(&kgdb_step_hook);
+	unregister_die_notifier(&kgdb_notifier);
+}
+
+/*
+ * ARM instructions are always in LE.
+ * Break instruction is encoded in LE format
+ */
+struct kgdb_arch arch_kgdb_ops = {
+	.gdb_bpt_instr = {
+		KGDB_DYN_BRK_INS_BYTE0,
+		KGDB_DYN_BRK_INS_BYTE1,
+		KGDB_DYN_BRK_INS_BYTE2,
+		KGDB_DYN_BRK_INS_BYTE3,
+	}
+};
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1348,8 +1348,8 @@ early_initcall(init_hw_perf_events);
 * Callchain handling code.
 */
 struct frame_tail {
-	struct frame_tail   __user *fp;
-	unsigned long	    lr;
+	struct frame_tail	__user *fp;
+	unsigned long		lr;
 } __attribute__((packed));

 /*
@@ -1386,22 +1386,80 @@ user_backtrace(struct frame_tail __user *tail,
 	return buftail.fp;
 }

+/*
+ * The registers we're interested in are at the end of the variable
+ * length saved register structure. The fp points at the end of this
+ * structure so the address of this struct is:
+ * (struct compat_frame_tail *)(xxx->fp)-1
+ *
+ * This code has been adapted from the ARM OProfile support.
+ */
+struct compat_frame_tail {
+	compat_uptr_t	fp; /* a (struct compat_frame_tail *) in compat mode */
+	u32		sp;
+	u32		lr;
+} __attribute__((packed));
+
+static struct compat_frame_tail __user *
+compat_user_backtrace(struct compat_frame_tail __user *tail,
+		      struct perf_callchain_entry *entry)
+{
+	struct compat_frame_tail buftail;
+	unsigned long err;
+
+	/* Also check accessibility of one struct frame_tail beyond */
+	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
+		return NULL;
+
+	pagefault_disable();
+	err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail));
+	pagefault_enable();
+
+	if (err)
+		return NULL;
+
+	perf_callchain_store(entry, buftail.lr);
+
+	/*
+	 * Frame pointers should strictly progress back up the stack
+	 * (towards higher addresses).
+	 */
+	if (tail + 1 >= (struct compat_frame_tail __user *)
+			compat_ptr(buftail.fp))
+		return NULL;
+
+	return (struct compat_frame_tail __user *)compat_ptr(buftail.fp) - 1;
+}
+
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	struct frame_tail __user *tail;
-
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
 		/* We don't support guest os callchain now */
 		return;
 	}

 	perf_callchain_store(entry, regs->pc);
-	tail = (struct frame_tail __user *)regs->regs[29];

-	while (entry->nr < PERF_MAX_STACK_DEPTH &&
-	       tail && !((unsigned long)tail & 0xf))
-		tail = user_backtrace(tail, entry);
+	if (!compat_user_mode(regs)) {
+		/* AARCH64 mode */
+		struct frame_tail __user *tail;
+
+		tail = (struct frame_tail __user *)regs->regs[29];
+
+		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		       tail && !((unsigned long)tail & 0xf))
+			tail = user_backtrace(tail, entry);
+	} else {
+		/* AARCH32 compat mode */
+		struct compat_frame_tail __user *tail;
+
+		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
+
+		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+			tail && !((unsigned long)tail & 0x3))
+			tail = compat_user_backtrace(tail, entry);
+	}
 }

 /*
@@ -1429,6 +1487,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	frame.fp = regs->regs[29];
 	frame.sp = regs->sp;
 	frame.pc = regs->pc;
+
 	walk_stackframe(&frame, callchain_trace, entry);
 }


--- a/arch/arm64/kernel/perf_regs.c
+++ b/arch/arm64/kernel/perf_regs.c
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/bug.h>
+#include <asm/perf_regs.h>
+#include <asm/ptrace.h>
+
+u64 perf_reg_value(struct pt_regs *regs, int idx)
+{
+	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX))
+		return 0;
+
+	/*
+	 * Compat (i.e. 32 bit) mode:
+	 * - PC has been set in the pt_regs struct in kernel_entry,
+	 * - Handle SP and LR here.
+	 */
+	if (compat_user_mode(regs)) {
+		if ((u32)idx == PERF_REG_ARM64_SP)
+			return regs->compat_sp;
+		if ((u32)idx == PERF_REG_ARM64_LR)
+			return regs->compat_lr;
+	}
+
+	return regs->regs[idx];
+}
+
+#define REG_RESERVED (~((1ULL << PERF_REG_ARM64_MAX) - 1))
+
+int perf_reg_validate(u64 mask)
+{
+	if (!mask || mask & REG_RESERVED)
+		return -EINVAL;
+
+	return 0;
+}
+
+u64 perf_reg_abi(struct task_struct *task)
+{
+	if (is_compat_thread(task_thread_info(task)))
+		return PERF_SAMPLE_REGS_ABI_32;
+	else
+		return PERF_SAMPLE_REGS_ABI_64;
+}
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -71,8 +71,17 @@ static void setup_restart(void)

 void soft_restart(unsigned long addr)
 {
+	typedef void (*phys_reset_t)(unsigned long);
+	phys_reset_t phys_reset;
+
 	setup_restart();
-	cpu_reset(addr);
+
+	/* Switch to the identity mapping */
+	phys_reset = (phys_reset_t)virt_to_phys(cpu_reset);
+	phys_reset(addr);
+
+	/* Should never get here */
+	BUG();
 }

 /*

--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -176,22 +176,20 @@ static const struct of_device_id psci_of_match[] __initconst = {
 	{},
 };

-int __init psci_init(void)
+void __init psci_init(void)
 {
 	struct device_node *np;
 	const char *method;
 	u32 id;
-	int err = 0;

 	np = of_find_matching_node(NULL, psci_of_match);
 	if (!np)
-		return -ENODEV;
+		return;

 	pr_info("probing function IDs from device-tree\n");

 	if (of_property_read_string(np, "method", &method)) {
 		pr_warning("missing \"method\" property\n");
-		err = -ENXIO;
 		goto out_put_node;
 	}

@@ -201,7 +199,6 @@ int __init psci_init(void)
 		invoke_psci_fn = __invoke_psci_fn_smc;
 	} else {
 		pr_warning("invalid \"method\" property: %s\n", method);
-		err = -EINVAL;
 		goto out_put_node;
 	}

@@ -227,7 +224,7 @@ int __init psci_init(void)

 out_put_node:
 	of_node_put(np);
-	return err;
+	return;
 }

 #ifdef CONFIG_SMP
@@ -251,7 +248,7 @@ static int cpu_psci_cpu_boot(unsigned int cpu)
 {
 	int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa(secondary_entry));
 	if (err)
-		pr_err("psci: failed to boot CPU%d (%d)\n", cpu, err);
+		pr_err("failed to boot CPU%d (%d)\n", cpu, err);

 	return err;
 }
@@ -278,7 +275,7 @@ static void cpu_psci_cpu_die(unsigned int cpu)

 	ret = psci_ops.cpu_off(state);

-	pr_crit("psci: unable to power off CPU%u (%d)\n", cpu, ret);
+	pr_crit("unable to power off CPU%u (%d)\n", cpu, ret);
 }
 #endif


--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -69,6 +69,7 @@ EXPORT_SYMBOL_GPL(elf_hwcap);
 				 COMPAT_HWCAP_VFPv3|COMPAT_HWCAP_VFPv4|\
 				 COMPAT_HWCAP_NEON|COMPAT_HWCAP_IDIV)
 unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
+unsigned int compat_elf_hwcap2 __read_mostly;
 #endif

 static const char *cpu_name;
@@ -242,6 +243,38 @@ static void __init setup_processor(void)
 	block = (features >> 16) & 0xf;
 	if (block && !(block & 0x8))
 		elf_hwcap |= HWCAP_CRC32;
+
+#ifdef CONFIG_COMPAT
+	/*
+	 * ID_ISAR5_EL1 carries similar information as above, but pertaining to
+	 * the Aarch32 32-bit execution state.
+	 */
+	features = read_cpuid(ID_ISAR5_EL1);
+	block = (features >> 4) & 0xf;
+	if (!(block & 0x8)) {
+		switch (block) {
+		default:
+		case 2:
+			compat_elf_hwcap2 |= COMPAT_HWCAP2_PMULL;
+		case 1:
+			compat_elf_hwcap2 |= COMPAT_HWCAP2_AES;
+		case 0:
+			break;
+		}
+	}
+
+	block = (features >> 8) & 0xf;
+	if (block && !(block & 0x8))
+		compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA1;
+
+	block = (features >> 12) & 0xf;
+	if (block && !(block & 0x8))
+		compat_elf_hwcap2 |= COMPAT_HWCAP2_SHA2;
+
+	block = (features >> 16) & 0xf;
+	if (block && !(block & 0x8))
+		compat_elf_hwcap2 |= COMPAT_HWCAP2_CRC32;
+#endif
 }

 static void __init setup_machine_fdt(phys_addr_t dt_phys)

--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -114,6 +114,11 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 	return ret;
 }

+static void smp_store_cpu_info(unsigned int cpuid)
+{
+	store_cpu_topology(cpuid);
+}
+
 /*
 * This is the secondary CPU boot entry.  We're using this CPUs
 * idle thread stack, but a set of temporary page tables.
@@ -152,6 +157,8 @@ asmlinkage void secondary_start_kernel(void)
 	 */
 	notify_cpu_starting(cpu);

+	smp_store_cpu_info(cpu);
+
 	/*
 	 * OK, now it's safe to let the boot CPU continue.  Wait for
 	 * the CPU migration code to notice that the CPU is online
@@ -160,6 +167,7 @@ asmlinkage void secondary_start_kernel(void)
 	set_cpu_online(cpu, true);
 	complete(&cpu_running);

+	local_dbg_enable();
 	local_irq_enable();
 	local_async_enable();

@@ -390,6 +398,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	int err;
 	unsigned int cpu, ncores = num_possible_cpus();

+	init_cpu_topology();
+
+	smp_store_cpu_info(smp_processor_id());
+
 	/*
 	 * are we trying to boot more cores than exist?
 	 */

--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -128,7 +128,7 @@ static int smp_spin_table_cpu_boot(unsigned int cpu)
 	return secondary_holding_pen_release != INVALID_HWID ? -ENOSYS : 0;
 }

-void smp_spin_table_cpu_postboot(void)
+static void smp_spin_table_cpu_postboot(void)
 {
 	/*
 	 * Let the primary processor know we're out of the pen.

--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
+/*
+ * arch/arm64/kernel/topology.c
+ *
+ * Copyright (C) 2011,2013,2014 Linaro Limited.
+ *
+ * Based on the arm32 version written by Vincent Guittot in turn based on
+ * arch/sh/kernel/topology.c
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/node.h>
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+
+#include <asm/topology.h>
+
+/*
+ * cpu topology table
+ */
+struct cpu_topology cpu_topology[NR_CPUS];
+EXPORT_SYMBOL_GPL(cpu_topology);
+
+const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+	return &cpu_topology[cpu].core_sibling;
+}
+
+static void update_siblings_masks(unsigned int cpuid)
+{
+	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
+	int cpu;
+
+	if (cpuid_topo->cluster_id == -1) {
+		/*
+		 * DT does not contain topology information for this cpu
+		 * reset it to default behaviour
+		 */
+		pr_debug("CPU%u: No topology information configured\n", cpuid);
+		cpuid_topo->core_id = 0;
+		cpumask_set_cpu(cpuid, &cpuid_topo->core_sibling);
+		cpumask_set_cpu(cpuid, &cpuid_topo->thread_sibling);
+		return;
+	}
+
+	/* update core and thread sibling masks */
+	for_each_possible_cpu(cpu) {
+		cpu_topo = &cpu_topology[cpu];
+
+		if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
+			continue;
+
+		cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
+		if (cpu != cpuid)
+			cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
+
+		if (cpuid_topo->core_id != cpu_topo->core_id)
+			continue;
+
+		cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
+		if (cpu != cpuid)
+			cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
+	}
+}
+
+void store_cpu_topology(unsigned int cpuid)
+{
+	update_siblings_masks(cpuid);
+}
+
+/*
+ * init_cpu_topology is called at boot when only one cpu is running
+ * which prevent simultaneous write access to cpu_topology array
+ */
+void __init init_cpu_topology(void)
+{
+	unsigned int cpu;
+
+	/* init core mask and power*/
+	for_each_possible_cpu(cpu) {
+		struct cpu_topology *cpu_topo = &cpu_topology[cpu];
+
+		cpu_topo->thread_id = -1;
+		cpu_topo->core_id =  -1;
+		cpu_topo->cluster_id = -1;
+		cpumask_clear(&cpu_topo->core_sibling);
+		cpumask_clear(&cpu_topo->thread_sibling);
+	}
+}
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -106,49 +106,31 @@ int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp)

 static int __init vdso_init(void)
 {
-	struct page *pg;
-	char *vbase;
-	int i, ret = 0;
+	int i;
+
+	if (memcmp(&vdso_start, "\177ELF", 4)) {
+		pr_err("vDSO is not a valid ELF object!\n");
+		return -EINVAL;
+	}

 	vdso_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT;
 	pr_info("vdso: %ld pages (%ld code, %ld data) at base %p\n",
 		vdso_pages + 1, vdso_pages, 1L, &vdso_start);

 	/* Allocate the vDSO pagelist, plus a page for the data. */
-	vdso_pagelist = kzalloc(sizeof(struct page *) * (vdso_pages + 1),
+	vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *),
 				GFP_KERNEL);
-	if (vdso_pagelist == NULL) {
-		pr_err("Failed to allocate vDSO pagelist!\n");
+	if (vdso_pagelist == NULL)
 		return -ENOMEM;
-	}

 	/* Grab the vDSO code pages. */
-	for (i = 0; i < vdso_pages; i++) {
-		pg = virt_to_page(&vdso_start + i*PAGE_SIZE);
-		ClearPageReserved(pg);
-		get_page(pg);
-		vdso_pagelist[i] = pg;
-	}
-
-	/* Sanity check the shared object header. */
-	vbase = vmap(vdso_pagelist, 1, 0, PAGE_KERNEL);
-	if (vbase == NULL) {
-		pr_err("Failed to map vDSO pagelist!\n");
-		return -ENOMEM;
-	} else if (memcmp(vbase, "\177ELF", 4)) {
-		pr_err("vDSO is not a valid ELF object!\n");
-		ret = -EINVAL;
-		goto unmap;
-	}
+	for (i = 0; i < vdso_pages; i++)
+		vdso_pagelist[i] = virt_to_page(&vdso_start + i * PAGE_SIZE);

 	/* Grab the vDSO data page. */
-	pg = virt_to_page(vdso_data);
-	get_page(pg);
-	vdso_pagelist[i] = pg;
+	vdso_pagelist[i] = virt_to_page(vdso_data);

-unmap:
-	vunmap(vbase);
-	return ret;
+	return 0;
 }
 arch_initcall(vdso_init);


--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -68,6 +68,12 @@ __do_hyp_init:
 	msr	tcr_el2, x4

 	ldr	x4, =VTCR_EL2_FLAGS
+	/*
+	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
+	 * VTCR_EL2.
+	 */
+	mrs	x5, ID_AA64MMFR0_EL1
+	bfi	x4, x5, #16, #3
 	msr	vtcr_el2, x4

 	mrs	x4, mair_el1

--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -30,7 +30,7 @@
 *
 *	Corrupted registers: x0-x7, x9-x11
 */
-ENTRY(__flush_dcache_all)
+__flush_dcache_all:
 	dsb	sy				// ensure ordering with previous memory accesses
 	mrs	x0, clidr_el1			// read clidr
 	and	x3, x0, #0x7000000		// extract loc from clidr
@@ -166,3 +166,81 @@ ENTRY(__flush_dcache_area)
 	dsb	sy
 	ret
 ENDPROC(__flush_dcache_area)
+
+/*
+ *	__dma_inv_range(start, end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+__dma_inv_range:
+	dcache_line_size x2, x3
+	sub	x3, x2, #1
+	bic	x0, x0, x3
+	bic	x1, x1, x3
+1:	dc	ivac, x0			// invalidate D / U line
+	add	x0, x0, x2
+	cmp	x0, x1
+	b.lo	1b
+	dsb	sy
+	ret
+ENDPROC(__dma_inv_range)
+
+/*
+ *	__dma_clean_range(start, end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+__dma_clean_range:
+	dcache_line_size x2, x3
+	sub	x3, x2, #1
+	bic	x0, x0, x3
+1:	dc	cvac, x0			// clean D / U line
+	add	x0, x0, x2
+	cmp	x0, x1
+	b.lo	1b
+	dsb	sy
+	ret
+ENDPROC(__dma_clean_range)
+
+/*
+ *	__dma_flush_range(start, end)
+ *	- start   - virtual start address of region
+ *	- end     - virtual end address of region
+ */
+ENTRY(__dma_flush_range)
+	dcache_line_size x2, x3
+	sub	x3, x2, #1
+	bic	x0, x0, x3
+1:	dc	civac, x0			// clean & invalidate D / U line
+	add	x0, x0, x2
+	cmp	x0, x1
+	b.lo	1b
+	dsb	sy
+	ret
+ENDPROC(__dma_flush_range)
+
+/*
+ *	__dma_map_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(__dma_map_area)
+	add	x1, x1, x0
+	cmp	w2, #DMA_FROM_DEVICE
+	b.eq	__dma_inv_range
+	b	__dma_clean_range
+ENDPROC(__dma_map_area)
+
+/*
+ *	__dma_unmap_area(start, size, dir)
+ *	- start	- kernel virtual start address
+ *	- size	- size of region
+ *	- dir	- DMA direction
+ */
+ENTRY(__dma_unmap_area)
+	add	x1, x1, x0
+	cmp	w2, #DMA_TO_DEVICE
+	b.ne	__dma_inv_range
+	ret
+ENDPROC(__dma_unmap_area)
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -30,18 +30,26 @@
 struct dma_map_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);

-static void *arm64_swiotlb_alloc_coherent(struct device *dev, size_t size,
-					  dma_addr_t *dma_handle, gfp_t flags,
-					  struct dma_attrs *attrs)
+static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
+				 bool coherent)
+{
+	if (!coherent || dma_get_attr(DMA_ATTR_WRITE_COMBINE, attrs))
+		return pgprot_writecombine(prot);
+	return prot;
+}
+
+static void *__dma_alloc_coherent(struct device *dev, size_t size,
+				  dma_addr_t *dma_handle, gfp_t flags,
+				  struct dma_attrs *attrs)
 {
 	if (dev == NULL) {
 		WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
 		return NULL;
 	}

-	if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
+	if (IS_ENABLED(CONFIG_ZONE_DMA) &&
 	    dev->coherent_dma_mask <= DMA_BIT_MASK(32))
-		flags |= GFP_DMA32;
+		flags |= GFP_DMA;
 	if (IS_ENABLED(CONFIG_DMA_CMA)) {
 		struct page *page;

@@ -58,9 +66,9 @@ static void *arm64_swiotlb_alloc_coherent(struct device *dev, size_t size,
 	}
 }

-static void arm64_swiotlb_free_coherent(struct device *dev, size_t size,
-					void *vaddr, dma_addr_t dma_handle,
-					struct dma_attrs *attrs)
+static void __dma_free_coherent(struct device *dev, size_t size,
+				void *vaddr, dma_addr_t dma_handle,
+				struct dma_attrs *attrs)
 {
 	if (dev == NULL) {
 		WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
@@ -78,9 +86,212 @@ static void arm64_swiotlb_free_coherent(struct device *dev, size_t size,
 	}
 }

-static struct dma_map_ops arm64_swiotlb_dma_ops = {
-	.alloc = arm64_swiotlb_alloc_coherent,
-	.free = arm64_swiotlb_free_coherent,
+static void *__dma_alloc_noncoherent(struct device *dev, size_t size,
+				     dma_addr_t *dma_handle, gfp_t flags,
+				     struct dma_attrs *attrs)
+{
+	struct page *page, **map;
+	void *ptr, *coherent_ptr;
+	int order, i;
+
+	size = PAGE_ALIGN(size);
+	order = get_order(size);
+
+	ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs);
+	if (!ptr)
+		goto no_mem;
+	map = kmalloc(sizeof(struct page *) << order, flags & ~GFP_DMA);
+	if (!map)
+		goto no_map;
+
+	/* remove any dirty cache lines on the kernel alias */
+	__dma_flush_range(ptr, ptr + size);
+
+	/* create a coherent mapping */
+	page = virt_to_page(ptr);
+	for (i = 0; i < (size >> PAGE_SHIFT); i++)
+		map[i] = page + i;
+	coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP,
+			    __get_dma_pgprot(attrs, pgprot_default, false));
+	kfree(map);
+	if (!coherent_ptr)
+		goto no_map;
+
+	return coherent_ptr;
+
+no_map:
+	__dma_free_coherent(dev, size, ptr, *dma_handle, attrs);
+no_mem:
+	*dma_handle = ~0;
+	return NULL;
+}
+
+static void __dma_free_noncoherent(struct device *dev, size_t size,
+				   void *vaddr, dma_addr_t dma_handle,
+				   struct dma_attrs *attrs)
+{
+	void *swiotlb_addr = phys_to_virt(dma_to_phys(dev, dma_handle));
+
+	vunmap(vaddr);
+	__dma_free_coherent(dev, size, swiotlb_addr, dma_handle, attrs);
+}
+
+static dma_addr_t __swiotlb_map_page(struct device *dev, struct page *page,
+				     unsigned long offset, size_t size,
+				     enum dma_data_direction dir,
+				     struct dma_attrs *attrs)
+{
+	dma_addr_t dev_addr;
+
+	dev_addr = swiotlb_map_page(dev, page, offset, size, dir, attrs);
+	__dma_map_area(phys_to_virt(dma_to_phys(dev, dev_addr)), size, dir);
+
+	return dev_addr;
+}
+
+
+static void __swiotlb_unmap_page(struct device *dev, dma_addr_t dev_addr,
+				 size_t size, enum dma_data_direction dir,
+				 struct dma_attrs *attrs)
+{
+	__dma_unmap_area(phys_to_virt(dma_to_phys(dev, dev_addr)), size, dir);
+	swiotlb_unmap_page(dev, dev_addr, size, dir, attrs);
+}
+
+static int __swiotlb_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
+				  int nelems, enum dma_data_direction dir,
+				  struct dma_attrs *attrs)
+{
+	struct scatterlist *sg;
+	int i, ret;
+
+	ret = swiotlb_map_sg_attrs(dev, sgl, nelems, dir, attrs);
+	for_each_sg(sgl, sg, ret, i)
+		__dma_map_area(phys_to_virt(dma_to_phys(dev, sg->dma_address)),
+			       sg->length, dir);
+
+	return ret;
+}
+
+static void __swiotlb_unmap_sg_attrs(struct device *dev,
+				     struct scatterlist *sgl, int nelems,
+				     enum dma_data_direction dir,
+				     struct dma_attrs *attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_unmap_area(phys_to_virt(dma_to_phys(dev, sg->dma_address)),
+				 sg->length, dir);
+	swiotlb_unmap_sg_attrs(dev, sgl, nelems, dir, attrs);
+}
+
+static void __swiotlb_sync_single_for_cpu(struct device *dev,
+					  dma_addr_t dev_addr, size_t size,
+					  enum dma_data_direction dir)
+{
+	__dma_unmap_area(phys_to_virt(dma_to_phys(dev, dev_addr)), size, dir);
+	swiotlb_sync_single_for_cpu(dev, dev_addr, size, dir);
+}
+
+static void __swiotlb_sync_single_for_device(struct device *dev,
+					     dma_addr_t dev_addr, size_t size,
+					     enum dma_data_direction dir)
+{
+	swiotlb_sync_single_for_device(dev, dev_addr, size, dir);
+	__dma_map_area(phys_to_virt(dma_to_phys(dev, dev_addr)), size, dir);
+}
+
+static void __swiotlb_sync_sg_for_cpu(struct device *dev,
+				      struct scatterlist *sgl, int nelems,
+				      enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_unmap_area(phys_to_virt(dma_to_phys(dev, sg->dma_address)),
+				 sg->length, dir);
+	swiotlb_sync_sg_for_cpu(dev, sgl, nelems, dir);
+}
+
+static void __swiotlb_sync_sg_for_device(struct device *dev,
+					 struct scatterlist *sgl, int nelems,
+					 enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	swiotlb_sync_sg_for_device(dev, sgl, nelems, dir);
+	for_each_sg(sgl, sg, nelems, i)
+		__dma_map_area(phys_to_virt(dma_to_phys(dev, sg->dma_address)),
+			       sg->length, dir);
+}
+
+/* vma->vm_page_prot must be set appropriately before calling this function */
+static int __dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
+			     void *cpu_addr, dma_addr_t dma_addr, size_t size)
+{
+	int ret = -ENXIO;
+	unsigned long nr_vma_pages = (vma->vm_end - vma->vm_start) >>
+					PAGE_SHIFT;
+	unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	unsigned long pfn = dma_to_phys(dev, dma_addr) >> PAGE_SHIFT;
+	unsigned long off = vma->vm_pgoff;
+
+	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+		return ret;
+
+	if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) {
+		ret = remap_pfn_range(vma, vma->vm_start,
+				      pfn + off,
+				      vma->vm_end - vma->vm_start,
+				      vma->vm_page_prot);
+	}
+
+	return ret;
+}
+
+static int __swiotlb_mmap_noncoherent(struct device *dev,
+		struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		struct dma_attrs *attrs)
+{
+	vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, false);
+	return __dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
+}
+
+static int __swiotlb_mmap_coherent(struct device *dev,
+		struct vm_area_struct *vma,
+		void *cpu_addr, dma_addr_t dma_addr, size_t size,
+		struct dma_attrs *attrs)
+{
+	/* Just use whatever page_prot attributes were specified */
+	return __dma_common_mmap(dev, vma, cpu_addr, dma_addr, size);
+}
+
+struct dma_map_ops noncoherent_swiotlb_dma_ops = {
+	.alloc = __dma_alloc_noncoherent,
+	.free = __dma_free_noncoherent,
+	.mmap = __swiotlb_mmap_noncoherent,
+	.map_page = __swiotlb_map_page,
+	.unmap_page = __swiotlb_unmap_page,
+	.map_sg = __swiotlb_map_sg_attrs,
+	.unmap_sg = __swiotlb_unmap_sg_attrs,
+	.sync_single_for_cpu = __swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = __swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu = __swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device = __swiotlb_sync_sg_for_device,
+	.dma_supported = swiotlb_dma_supported,
+	.mapping_error = swiotlb_dma_mapping_error,
+};
+EXPORT_SYMBOL(noncoherent_swiotlb_dma_ops);
+
+struct dma_map_ops coherent_swiotlb_dma_ops = {
+	.alloc = __dma_alloc_coherent,
+	.free = __dma_free_coherent,
+	.mmap = __swiotlb_mmap_coherent,
 	.map_page = swiotlb_map_page,
 	.unmap_page = swiotlb_unmap_page,
 	.map_sg = swiotlb_map_sg_attrs,
@@ -92,12 +303,19 @@ static struct dma_map_ops arm64_swiotlb_dma_ops = {
 	.dma_supported = swiotlb_dma_supported,
 	.mapping_error = swiotlb_dma_mapping_error,
 };
+EXPORT_SYMBOL(coherent_swiotlb_dma_ops);
+
+extern int swiotlb_late_init_with_default_size(size_t default_size);

-void __init arm64_swiotlb_init(void)
+static int __init swiotlb_late_init(void)
 {
-	dma_ops = &arm64_swiotlb_dma_ops;
-	swiotlb_init(1);
+	size_t swiotlb_size = min(SZ_64M, MAX_ORDER_NR_PAGES << PAGE_SHIFT);
+
+	dma_ops = &coherent_swiotlb_dma_ops;
+
+	return swiotlb_late_init_with_default_size(swiotlb_size);
 }
+subsys_initcall(swiotlb_late_init);

 #define PREALLOC_DMA_DEBUG_ENTRIES	4096


--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -30,6 +30,7 @@
 #include <linux/memblock.h>
 #include <linux/sort.h>
 #include <linux/of_fdt.h>
+#include <linux/dma-mapping.h>
 #include <linux/dma-contiguous.h>

 #include <asm/sections.h>
@@ -59,22 +60,22 @@ static int __init early_initrd(char *p)
 early_param("initrd", early_initrd);
 #endif

-#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
-
 static void __init zone_sizes_init(unsigned long min, unsigned long max)
 {
 	struct memblock_region *reg;
 	unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
-	unsigned long max_dma32 = min;
+	unsigned long max_dma = min;

 	memset(zone_size, 0, sizeof(zone_size));

-#ifdef CONFIG_ZONE_DMA32
 	/* 4GB maximum for 32-bit only capable devices */
-	max_dma32 = max(min, min(max, MAX_DMA32_PFN));
-	zone_size[ZONE_DMA32] = max_dma32 - min;
-#endif
-	zone_size[ZONE_NORMAL] = max - max_dma32;
+	if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+		unsigned long max_dma_phys =
+			(unsigned long)dma_to_phys(NULL, DMA_BIT_MASK(32) + 1);
+		max_dma = max(min, min(max, max_dma_phys >> PAGE_SHIFT));
+		zone_size[ZONE_DMA] = max_dma - min;
+	}
+	zone_size[ZONE_NORMAL] = max - max_dma;

 	memcpy(zhole_size, zone_size, sizeof(zhole_size));

@@ -84,15 +85,15 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)

 		if (start >= max)
 			continue;
-#ifdef CONFIG_ZONE_DMA32
-		if (start < max_dma32) {
-			unsigned long dma_end = min(end, max_dma32);
-			zhole_size[ZONE_DMA32] -= dma_end - start;
+
+		if (IS_ENABLED(CONFIG_ZONE_DMA) && start < max_dma) {
+			unsigned long dma_end = min(end, max_dma);
+			zhole_size[ZONE_DMA] -= dma_end - start;
 		}
-#endif
-		if (end > max_dma32) {
+
+		if (end > max_dma) {
 			unsigned long normal_end = min(end, max);
-			unsigned long normal_start = max(start, max_dma32);
+			unsigned long normal_start = max(start, max_dma);
 			zhole_size[ZONE_NORMAL] -= normal_end - normal_start;
 		}
 	}
@@ -261,8 +262,6 @@ static void __init free_unused_memmap(void)
 */
 void __init mem_init(void)
 {
-	arm64_swiotlb_init();
-
 	max_mapnr   = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;

 #ifndef CONFIG_SPARSEMEM_VMEMMAP

--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -173,12 +173,6 @@ ENDPROC(cpu_do_switch_mm)
 *	value of the SCTLR_EL1 register.
 */
 ENTRY(__cpu_setup)
-	/*
-	 * Preserve the link register across the function call.
-	 */
-	mov	x28, lr
-	bl	__flush_dcache_all
-	mov	lr, x28
 	ic	iallu				// I+BTB cache invalidate
 	tlbi	vmalle1is			// invalidate I + D TLBs
 	dsb	sy
@@ -215,8 +209,14 @@ ENTRY(__cpu_setup)
 	 * Set/prepare TCR and TTBR. We use 512GB (39-bit) address range for
 	 * both user and kernel.
 	 */
-	ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_FLAGS | TCR_IPS_40BIT | \
+	ldr	x10, =TCR_TxSZ(VA_BITS) | TCR_FLAGS | \
 		      TCR_ASID16 | TCR_TBI0 | (1 << 31)
+	/*
+	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the IPS bits in
+	 * TCR_EL1.
+	 */
+	mrs	x9, ID_AA64MMFR0_EL1
+	bfi	x10, x9, #32, #3
 #ifdef CONFIG_ARM64_64K_PAGES
 	orr	x10, x10, TCR_TG0_64K
 	orr	x10, x10, TCR_TG1_64K

--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -200,7 +200,7 @@ source "drivers/cpufreq/Kconfig.x86"
 endmenu

 menu "ARM CPU frequency scaling drivers"
-depends on ARM
+depends on ARM || ARM64
 source "drivers/cpufreq/Kconfig.arm"
 endmenu


--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
 #define	ELF_HWCAP		COMPAT_ELF_HWCAP
 #endif

+#ifdef	COMPAT_ELF_HWCAP2
+#undef	ELF_HWCAP2
+#define	ELF_HWCAP2		COMPAT_ELF_HWCAP2
+#endif
+
 #ifdef	COMPAT_ARCH_DLINFO
 #undef	ARCH_DLINFO
 #define	ARCH_DLINFO		COMPAT_ARCH_DLINFO

--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
-#ifndef _ASM_POWERPC_RWSEM_H
-#define _ASM_POWERPC_RWSEM_H
+#ifndef _ASM_GENERIC_RWSEM_H
+#define _ASM_GENERIC_RWSEM_H

 #ifndef _LINUX_RWSEM_H
 #error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead."
@@ -8,7 +8,7 @@
 #ifdef __KERNEL__

 /*
- * R/W semaphores for PPC using the stuff in lib/rwsem.c.
+ * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
 * Adapted largely from include/asm-i386/rwsem.h
 * by Paul Mackerras <paulus@samba.org>.
 */
@@ -16,7 +16,7 @@
 /*
 * the semaphore definition
 */
-#ifdef CONFIG_PPC64
+#ifdef CONFIG_64BIT
 # define RWSEM_ACTIVE_MASK		0xffffffffL
 #else
 # define RWSEM_ACTIVE_MASK		0x0000ffffL
@@ -129,4 +129,4 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
 }

 #endif	/* __KERNEL__ */
-#endif	/* _ASM_POWERPC_RWSEM_H */
+#endif	/* _ASM_GENERIC_RWSEM_H */
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -1035,7 +1035,7 @@ int dbg_io_get_char(void)
 * otherwise as a quick means to stop program execution and "break" into
 * the debugger.
 */
-void kgdb_breakpoint(void)
+noinline void kgdb_breakpoint(void)
 {
 	atomic_inc(&kgdb_setting_breakpoint);
 	wmb(); /* Sync point before breakpoint */