1. 15 Dec, 2011 4 commits
    • David Howells's avatar
      x86_64, asm: Optimise fls(), ffs() and fls64() · ca3d30cc
      David Howells authored
      fls(N), ffs(N) and fls64(N) can be optimised on x86_64.  Currently they use a
      CMOV instruction after the BSR/BSF to set the destination register to -1 if the
      value to be scanned was 0 (in which case BSR/BSF set the Z flag).
      
      Instead, according to the AMD64 specification, we can make use of the fact that
      BSR/BSF doesn't modify its output register if its input is 0.  By preloading
      the output with -1 and incrementing the result, we achieve the desired result
      without the need for a conditional check.
      
      The Intel x86_64 specification, however, says that the result of BSR/BSF in
      such a case is undefined.  That said, when queried, one of the Intel CPU
      architects said that the behaviour on all Intel CPUs is that:
      
       (1) with BSRQ/BSFQ, the 64-bit destination register is written with its
           original value if the source is 0, thus, in essence, giving the effect we
           want.  And,
      
       (2) with BSRL/BSFL, the lower half of the 64-bit destination register is
           written with its original value if the source is 0, and the upper half is
           cleared, thus giving us the effect we want (we return a 4-byte int).
      
      Further, it was indicated that they (Intel) are unlikely to get away with
      changing the behaviour.
      
      It might be possible to optimise the 32-bit versions of these functions, but
      there's a lot more variation, and so the effective non-destructive property of
      BSRL/BSRF cannot be relied on.
      
      [ hpa: specifically, some 486 chips are known to NOT have this property. ]
      
      I have benchmarked these functions on my Core2 Duo test machine using the
      following program:
      
      	#include <stdlib.h>
      	#include <stdio.h>
      
      	#ifndef __x86_64__
      	#error
      	#endif
      
      	#define PAGE_SHIFT 12
      
      	typedef unsigned long long __u64, u64;
      	typedef unsigned int __u32, u32;
      	#define noinline	__attribute__((noinline))
      
      	static __always_inline int fls64(__u64 x)
      	{
      		long bitpos = -1;
      
      		asm("bsrq %1,%0"
      		    : "+r" (bitpos)
      		    : "rm" (x));
      		return bitpos + 1;
      	}
      
      	static inline unsigned long __fls(unsigned long word)
      	{
      		asm("bsr %1,%0"
      		    : "=r" (word)
      		    : "rm" (word));
      		return word;
      	}
      	static __always_inline int old_fls64(__u64 x)
      	{
      		if (x == 0)
      			return 0;
      		return __fls(x) + 1;
      	}
      
      	static noinline // __attribute__((const))
      	int old_get_order(unsigned long size)
      	{
      		int order;
      
      		size = (size - 1) >> (PAGE_SHIFT - 1);
      		order = -1;
      		do {
      			size >>= 1;
      			order++;
      		} while (size);
      		return order;
      	}
      
      	static inline __attribute__((const))
      	int get_order_old_fls64(unsigned long size)
      	{
      		int order;
      		size--;
      		size >>= PAGE_SHIFT;
      		order = old_fls64(size);
      		return order;
      	}
      
      	static inline __attribute__((const))
      	int get_order(unsigned long size)
      	{
      		int order;
      		size--;
      		size >>= PAGE_SHIFT;
      		order = fls64(size);
      		return order;
      	}
      
      	unsigned long prevent_optimise_out;
      
      	static noinline unsigned long test_old_get_order(void)
      	{
      		unsigned long n, total = 0;
      		long rep, loop;
      
      		for (rep = 1000000; rep > 0; rep--) {
      			for (loop = 0; loop <= 16384; loop += 4) {
      				n = 1UL << loop;
      				total += old_get_order(n);
      			}
      		}
      		return total;
      	}
      
      	static noinline unsigned long test_get_order_old_fls64(void)
      	{
      		unsigned long n, total = 0;
      		long rep, loop;
      
      		for (rep = 1000000; rep > 0; rep--) {
      			for (loop = 0; loop <= 16384; loop += 4) {
      				n = 1UL << loop;
      				total += get_order_old_fls64(n);
      			}
      		}
      		return total;
      	}
      
      	static noinline unsigned long test_get_order(void)
      	{
      		unsigned long n, total = 0;
      		long rep, loop;
      
      		for (rep = 1000000; rep > 0; rep--) {
      			for (loop = 0; loop <= 16384; loop += 4) {
      				n = 1UL << loop;
      				total += get_order(n);
      			}
      		}
      		return total;
      	}
      
      	int main(int argc, char **argv)
      	{
      		unsigned long total;
      
      		switch (argc) {
      		case 1:  total = test_old_get_order();		break;
      		case 2:  total = test_get_order_old_fls64();	break;
      		default: total = test_get_order();		break;
      		}
      		prevent_optimise_out = total;
      		return 0;
      	}
      
      This allows me to test the use of the old fls64() implementation and the new
      fls64() implementation and also to contrast these to the out-of-line loop-based
      implementation of get_order().  The results were:
      
      	warthog>time ./get_order
      	real    1m37.191s
      	user    1m36.313s
      	sys     0m0.861s
      	warthog>time ./get_order x
      	real    0m16.892s
      	user    0m16.586s
      	sys     0m0.287s
      	warthog>time ./get_order x x
      	real    0m7.731s
      	user    0m7.727s
      	sys     0m0.002s
      
      Using the current upstream fls64() as a basis for an inlined get_order() [the
      second result above] is much faster than using the current out-of-line
      loop-based get_order() [the first result above].
      
      Using my optimised inline fls64()-based get_order() [the third result above]
      is even faster still.
      
      [ hpa: changed the selection of 32 vs 64 bits to use CONFIG_X86_64
        instead of comparing BITS_PER_LONG, updated comments, rebased manually
        on top of 83d99df7 x86, bitops: Move fls64.h inside __KERNEL__ ]
      Signed-off-by: default avatarDavid Howells <dhowells@redhat.com>
      Link: http://lkml.kernel.org/r/20111213145654.14362.39868.stgit@warthog.procyon.org.uk
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Signed-off-by: default avatarH. Peter Anvin <hpa@linux.intel.com>
      ca3d30cc
    • H. Peter Anvin's avatar
      x86, bitops: Move fls64.h inside __KERNEL__ · 83d99df7
      H. Peter Anvin authored
      We would include <asm-generic/bitops/fls64.h> even without __KERNEL__,
      but that doesn't make sense, as:
      
      1. That file provides fls64(), but the corresponding function fls() is
         not exported to user space.
      2. The implementation of fls64.h uses kernel-only symbols.
      3. fls64.h is not exported to user space.
      
      This appears to have been a bug introduced in checkin:
      
      d57594c2 bitops: use __fls for fls64 on 64-bit archs
      
      Cc: Stephen Hemminger <shemminger@vyatta.com>
      Cc: Alexander van Heukelum <heukelum@mailshack.com>
      Cc: David Howells <dhowells@redhat.com>
      Signed-off-by: default avatarH. Peter Anvin <hpa@zytor.com>
      Link: http://lkml.kernel.org/r/4EEA77E1.6050009@zytor.com
      83d99df7
    • Jan Beulich's avatar
      x86: Fix and improve percpu_cmpxchg{8,16}b_double() · cebef5be
      Jan Beulich authored
      They had several problems/shortcomings:
      
      Only the first memory operand was mentioned in the 2x32bit asm()
      operands, and 2x64-bit version had a memory clobber. The first
      allowed the compiler to not recognize the need to re-load the
      data in case it had it cached in some register, and the second
      was overly destructive.
      
      The memory operand in the 2x32-bit asm() was declared to only be
      an output.
      
      The types of the local copies of the old and new values were
      incorrect (as in other per-CPU ops, the types of the per-CPU
      variables accessed should be used here, to make sure the
      respective types are compatible).
      
      The __dummy variable was pointless (and needlessly initialized
      in the 2x32-bit case), given that local copies of the inputs
      already exist.
      
      The 2x64-bit variant forced the address of the first object into
      %rsi, even though this is needed only for the call to the
      emulation function. The real cmpxchg16b can operate on an
      memory.
      
      At once also change the return value type to what it really is -
      'bool'.
      Signed-off-by: default avatarJan Beulich <jbeulich@suse.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Andrew Morton <akpm@linux-foundation.org>
      Cc: David Howells <dhowells@redhat.com>
      Cc: Christoph Lameter <cl@linux.com>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Link: http://lkml.kernel.org/r/4EE86D6502000078000679FE@nat28.tlf.novell.comSigned-off-by: default avatarIngo Molnar <mingo@elte.hu>
      cebef5be
    • Joerg Roedel's avatar
      x86: Report cpb and eff_freq_ro flags correctly · 969df4b8
      Joerg Roedel authored
      Add the flags to get rid of the [9] and [10] feature names
      in cpuinfo's 'power management' fields and replace them with
      meaningful names.
      Signed-off-by: default avatarJoerg Roedel <joerg.roedel@amd.com>
      Link: http://lkml.kernel.org/r/1323875574-17881-1-git-send-email-joerg.roedel@amd.comSigned-off-by: default avatarIngo Molnar <mingo@elte.hu>
      969df4b8
  2. 12 Dec, 2011 1 commit
  3. 07 Dec, 2011 1 commit
  4. 06 Dec, 2011 3 commits
  5. 05 Dec, 2011 9 commits
  6. 04 Dec, 2011 1 commit
    • Linus Torvalds's avatar
      x86: Fix boot failures on older AMD CPU's · 8e8da023
      Linus Torvalds authored
      People with old AMD chips are getting hung boots, because commit
      bcb80e53 ("x86, microcode, AMD: Add microcode revision to
      /proc/cpuinfo") moved the microcode detection too early into
      "early_init_amd()".
      
      At that point we are *so* early in the booth that the exception tables
      haven't even been set up yet, so the whole
      
      	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
      
      doesn't actually work: if the rdmsr does a GP fault (due to non-existant
      MSR register on older CPU's), we can't fix it up yet, and the boot fails.
      
      Fix it by simply moving the code to a slightly later point in the boot
      (init_amd() instead of early_init_amd()), since the kernel itself
      doesn't even really care about the microcode patchlevel at this point
      (or really ever: it's made available to user space in /proc/cpuinfo, and
      updated if you do a microcode load).
      Reported-tested-and-bisected-by: default avatarLarry Finger <Larry.Finger@lwfinger.net>
      Tested-by: default avatarBob Tracy <rct@gherkin.frus.com>
      Acked-by: default avatarBorislav Petkov <borislav.petkov@amd.com>
      Cc: Ingo Molnar <mingo@elte.hu>
      Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      8e8da023
  7. 03 Dec, 2011 1 commit
    • Konrad Rzeszutek Wilk's avatar
      xen/pm_idle: Make pm_idle be default_idle under Xen. · e5fd47bf
      Konrad Rzeszutek Wilk authored
      The idea behind commit d91ee586 ("cpuidle: replace xen access to x86
      pm_idle and default_idle") was to have one call - disable_cpuidle()
      which would make pm_idle not be molested by other code.  It disallows
      cpuidle_idle_call to be set to pm_idle (which is excellent).
      
      But in the select_idle_routine() and idle_setup(), the pm_idle can still
      be set to either: amd_e400_idle, mwait_idle or default_idle.  This
      depends on some CPU flags (MWAIT) and in AMD case on the type of CPU.
      
      In case of mwait_idle we can hit some instances where the hypervisor
      (Amazon EC2 specifically) sets the MWAIT and we get:
      
        Brought up 2 CPUs
        invalid opcode: 0000 [#1] SMP
      
        Pid: 0, comm: swapper Not tainted 3.1.0-0.rc6.git0.3.fc16.x86_64 #1
        RIP: e030:[<ffffffff81015d1d>]  [<ffffffff81015d1d>] mwait_idle+0x6f/0xb4
        ...
        Call Trace:
         [<ffffffff8100e2ed>] cpu_idle+0xae/0xe8
         [<ffffffff8149ee78>] cpu_bringup_and_idle+0xe/0x10
        RIP  [<ffffffff81015d1d>] mwait_idle+0x6f/0xb4
         RSP <ffff8801d28ddf10>
      
      In the case of amd_e400_idle we don't get so spectacular crashes, but we
      do end up making an MSR which is trapped in the hypervisor, and then
      follow it up with a yield hypercall.  Meaning we end up going to
      hypervisor twice instead of just once.
      
      The previous behavior before v3.0 was that pm_idle was set to
      default_idle regardless of select_idle_routine/idle_setup.
      
      We want to do that, but only for one specific case: Xen.  This patch
      does that.
      
      Fixes RH BZ #739499 and Ubuntu #881076
      Reported-by: default avatarStefan Bader <stefan.bader@canonical.com>
      Signed-off-by: default avatarKonrad Rzeszutek Wilk <konrad.wilk@oracle.com>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      e5fd47bf
  8. 02 Dec, 2011 14 commits
  9. 01 Dec, 2011 6 commits
    • Linus Torvalds's avatar
      Linux 3.2-rc4 · 5611cc45
      Linus Torvalds authored
      5611cc45
    • Linus Torvalds's avatar
      Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2 · 0a4ebed7
      Linus Torvalds authored
      * 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec/ocfs2: (31 commits)
        ocfs2: avoid unaligned access to dqc_bitmap
        ocfs2: Use filemap_write_and_wait() instead of write_inode_now()
        ocfs2: honor O_(D)SYNC flag in fallocate
        ocfs2: Add a missing journal credit in ocfs2_link_credits() -v2
        ocfs2: send correct UUID to cleancache initialization
        ocfs2: Commit transactions in error cases -v2
        ocfs2: make direntry invalid when deleting it
        fs/ocfs2/dlm/dlmlock.c: free kmem_cache_zalloc'd data using kmem_cache_free
        ocfs2: Avoid livelock in ocfs2_readpage()
        ocfs2: serialize unaligned aio
        ocfs2: Implement llseek()
        ocfs2: Fix ocfs2_page_mkwrite()
        ocfs2: Add comment about orphan scanning
        ocfs2: Clean up messages in the fs
        ocfs2/cluster: Cluster up now includes network connections too
        ocfs2/cluster: Add new function o2net_fill_node_map()
        ocfs2/cluster: Fix output in file elapsed_time_in_ms
        ocfs2/dlm: dlmlock_remote() needs to account for remastery
        ocfs2/dlm: Take inflight reference count for remotely mastered resources too
        ocfs2/dlm: Cleanup dlm_wait_for_node_death() and dlm_wait_for_node_recovery()
        ...
      0a4ebed7
    • Akinobu Mita's avatar
      ocfs2: avoid unaligned access to dqc_bitmap · 93925579
      Akinobu Mita authored
      The dqc_bitmap field of struct ocfs2_local_disk_chunk is 32-bit aligned,
      but not 64-bit aligned.  The dqc_bitmap is accessed by ocfs2_set_bit(),
      ocfs2_clear_bit(), ocfs2_test_bit(), or ocfs2_find_next_zero_bit().  These
      are wrapper macros for ext2_*_bit() which need to take an unsigned long
      aligned address (though some architectures are able to handle unaligned
      address correctly)
      
      So some 64bit architectures may not be able to access the dqc_bitmap
      correctly.
      
      This avoids such unaligned access by using another wrapper functions for
      ext2_*_bit().  The code is taken from fs/ext4/mballoc.c which also need to
      handle unaligned bitmap access.
      Signed-off-by: default avatarAkinobu Mita <akinobu.mita@gmail.com>
      Acked-by: default avatarJoel Becker <jlbec@evilplan.org>
      Cc: Mark Fasheh <mfasheh@suse.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarJoel Becker <jlbec@evilplan.org>
      93925579
    • Linus Torvalds's avatar
      Merge branch 'fixes' of http://ftp.arm.linux.org.uk/pub/linux/arm/kernel/git-cur/linux-2.6-arm · 3b120ab7
      Linus Torvalds authored
      * 'fixes' of http://ftp.arm.linux.org.uk/pub/linux/arm/kernel/git-cur/linux-2.6-arm:
        ARM: 7182/1: ARM cpu topology: fix warning
        ARM: 7181/1: Restrict kprobes probing SWP instructions to ARMv5 and below
        ARM: 7180/1: Change kprobes testcase with unpredictable STRD instruction
        ARM: 7177/1: GIC: avoid skipping non-existent PPIs in irq_start calculation
        ARM: 7176/1: cpu_pm: register GIC PM notifier only once
        ARM: 7175/1: add subname parameter to mfp_set_groupg callers
        ARM: 7174/1: Fix build error in kprobes test code on Thumb2 kernels
        ARM: 7172/1: dma: Drop GFP_COMP for DMA memory allocations
        ARM: 7171/1: unwind: add unwind directives to bitops assembly macros
        ARM: 7170/2: fix compilation breakage in entry-armv.S
        ARM: 7168/1: use cache type functions for arch_get_unmapped_area
        ARM: perf: check that we have a platform device when reserving PMU
        ARM: 7166/1: Use PMD_SHIFT instead of PGDIR_SHIFT in dma-consistent.c
        ARM: 7165/2: PL330: Fix typo in _prepare_ccr()
        ARM: 7163/2: PL330: Only register usable channels
        ARM: 7162/1: errata: tidy up Kconfig options for PL310 errata workarounds
        ARM: 7161/1: errata: no automatic store buffer drain
        ARM: perf: initialise used_mask for fake PMU during validation
        ARM: PMU: remove pmu_init declaration
        ARM: PMU: re-export release_pmu symbol to modules
      3b120ab7
    • David S. Miller's avatar
      Revert "udp: remove redundant variable" · 59c2cdae
      David S. Miller authored
      This reverts commit 81d54ec8.
      
      If we take the "try_again" goto, due to a checksum error,
      the 'len' has already been truncated.  So we won't compute
      the same values as the original code did.
      Reported-by: default avatarpaul bilke <fsmail@conspiracy.net>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      59c2cdae
    • Greg Kroah-Hartman's avatar
      Merge branch 'for-usb-linus' of... · 8593b6f6
      Greg Kroah-Hartman authored
      Merge branch 'for-usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sarah/xhci into usb-linus
      
      * 'for-usb-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sarah/xhci:
        Revert "xHCI: reset-on-resume quirk for NEC uPD720200"
        xHCI: fix bug in xhci_clear_command_ring()
      8593b6f6