1. 24 Feb, 2020 23 commits
    • Sergey Senozhatsky's avatar
      char/random: silence a lockdep splat with printk() · 15341b1d
      Sergey Senozhatsky authored
      [ Upstream commit 1b710b1b ]
      
      Sergey didn't like the locking order,
      
      uart_port->lock  ->  tty_port->lock
      
      uart_write (uart_port->lock)
        __uart_start
          pl011_start_tx
            pl011_tx_chars
              uart_write_wakeup
                tty_port_tty_wakeup
                  tty_port_default_wakeup
                    tty_port_tty_get (tty_port->lock)
      
      but that code is so old, and I have no clue how to de-couple it after
      checking other locks in the splat. There is an ongoing effort to make all
      printk() calls deferred, so until that happens, work around it for now as a
      short-term fix.
      
      LTP: starting iogen01 (export LTPROOT; rwtest -N iogen01 -i 120s -s
      read,write -Da -Dv -n 2 500b:$TMPDIR/doio.f1.$$
      1000b:$TMPDIR/doio.f2.$$)
      WARNING: possible circular locking dependency detected
      ------------------------------------------------------
      doio/49441 is trying to acquire lock:
      ffff008b7cff7290 (&(&zone->lock)->rlock){..-.}, at: rmqueue+0x138/0x2050
      
      but task is already holding lock:
      60ff000822352818 (&pool->lock/1){-.-.}, at: start_flush_work+0xd8/0x3f0
      
        which lock already depends on the new lock.
      
        the existing dependency chain (in reverse order) is:
      
        -> #4 (&pool->lock/1){-.-.}:
             lock_acquire+0x320/0x360
             _raw_spin_lock+0x64/0x80
             __queue_work+0x4b4/0xa10
             queue_work_on+0xac/0x11c
             tty_schedule_flip+0x84/0xbc
             tty_flip_buffer_push+0x1c/0x28
             pty_write+0x98/0xd0
             n_tty_write+0x450/0x60c
             tty_write+0x338/0x474
             __vfs_write+0x88/0x214
             vfs_write+0x12c/0x1a4
             redirected_tty_write+0x90/0xdc
             do_loop_readv_writev+0x140/0x180
             do_iter_write+0xe0/0x10c
             vfs_writev+0x134/0x1cc
             do_writev+0xbc/0x130
             __arm64_sys_writev+0x58/0x8c
             el0_svc_handler+0x170/0x240
             el0_sync_handler+0x150/0x250
             el0_sync+0x164/0x180
      
        -> #3 (&(&port->lock)->rlock){-.-.}:
             lock_acquire+0x320/0x360
             _raw_spin_lock_irqsave+0x7c/0x9c
             tty_port_tty_get+0x24/0x60
             tty_port_default_wakeup+0x1c/0x3c
             tty_port_tty_wakeup+0x34/0x40
             uart_write_wakeup+0x28/0x44
             pl011_tx_chars+0x1b8/0x270
             pl011_start_tx+0x24/0x70
             __uart_start+0x5c/0x68
             uart_write+0x164/0x1c8
             do_output_char+0x33c/0x348
             n_tty_write+0x4bc/0x60c
             tty_write+0x338/0x474
             redirected_tty_write+0xc0/0xdc
             do_loop_readv_writev+0x140/0x180
             do_iter_write+0xe0/0x10c
             vfs_writev+0x134/0x1cc
             do_writev+0xbc/0x130
             __arm64_sys_writev+0x58/0x8c
             el0_svc_handler+0x170/0x240
             el0_sync_handler+0x150/0x250
             el0_sync+0x164/0x180
      
        -> #2 (&port_lock_key){-.-.}:
             lock_acquire+0x320/0x360
             _raw_spin_lock+0x64/0x80
             pl011_console_write+0xec/0x2cc
             console_unlock+0x794/0x96c
             vprintk_emit+0x260/0x31c
             vprintk_default+0x54/0x7c
             vprintk_func+0x218/0x254
             printk+0x7c/0xa4
             register_console+0x734/0x7b0
             uart_add_one_port+0x734/0x834
             pl011_register_port+0x6c/0xac
             sbsa_uart_probe+0x234/0x2ec
             platform_drv_probe+0xd4/0x124
             really_probe+0x250/0x71c
             driver_probe_device+0xb4/0x200
             __device_attach_driver+0xd8/0x188
             bus_for_each_drv+0xbc/0x110
             __device_attach+0x120/0x220
             device_initial_probe+0x20/0x2c
             bus_probe_device+0x54/0x100
             device_add+0xae8/0xc2c
             platform_device_add+0x278/0x3b8
             platform_device_register_full+0x238/0x2ac
             acpi_create_platform_device+0x2dc/0x3a8
             acpi_bus_attach+0x390/0x3cc
             acpi_bus_attach+0x108/0x3cc
             acpi_bus_attach+0x108/0x3cc
             acpi_bus_attach+0x108/0x3cc
             acpi_bus_scan+0x7c/0xb0
             acpi_scan_init+0xe4/0x304
             acpi_init+0x100/0x114
             do_one_initcall+0x348/0x6a0
             do_initcall_level+0x190/0x1fc
             do_basic_setup+0x34/0x4c
             kernel_init_freeable+0x19c/0x260
             kernel_init+0x18/0x338
             ret_from_fork+0x10/0x18
      
        -> #1 (console_owner){-...}:
             lock_acquire+0x320/0x360
             console_lock_spinning_enable+0x6c/0x7c
             console_unlock+0x4f8/0x96c
             vprintk_emit+0x260/0x31c
             vprintk_default+0x54/0x7c
             vprintk_func+0x218/0x254
             printk+0x7c/0xa4
             get_random_u64+0x1c4/0x1dc
             shuffle_pick_tail+0x40/0xac
             __free_one_page+0x424/0x710
             free_one_page+0x70/0x120
             __free_pages_ok+0x61c/0xa94
             __free_pages_core+0x1bc/0x294
             memblock_free_pages+0x38/0x48
             __free_pages_memory+0xcc/0xfc
             __free_memory_core+0x70/0x78
             free_low_memory_core_early+0x148/0x18c
             memblock_free_all+0x18/0x54
             mem_init+0xb4/0x17c
             mm_init+0x14/0x38
             start_kernel+0x19c/0x530
      
        -> #0 (&(&zone->lock)->rlock){..-.}:
             validate_chain+0xf6c/0x2e2c
             __lock_acquire+0x868/0xc2c
             lock_acquire+0x320/0x360
             _raw_spin_lock+0x64/0x80
             rmqueue+0x138/0x2050
             get_page_from_freelist+0x474/0x688
             __alloc_pages_nodemask+0x3b4/0x18dc
             alloc_pages_current+0xd0/0xe0
             alloc_slab_page+0x2b4/0x5e0
             new_slab+0xc8/0x6bc
             ___slab_alloc+0x3b8/0x640
             kmem_cache_alloc+0x4b4/0x588
             __debug_object_init+0x778/0x8b4
             debug_object_init_on_stack+0x40/0x50
             start_flush_work+0x16c/0x3f0
             __flush_work+0xb8/0x124
             flush_work+0x20/0x30
             xlog_cil_force_lsn+0x88/0x204 [xfs]
             xfs_log_force_lsn+0x128/0x1b8 [xfs]
             xfs_file_fsync+0x3c4/0x488 [xfs]
             vfs_fsync_range+0xb0/0xd0
             generic_write_sync+0x80/0xa0 [xfs]
             xfs_file_buffered_aio_write+0x66c/0x6e4 [xfs]
             xfs_file_write_iter+0x1a0/0x218 [xfs]
             __vfs_write+0x1cc/0x214
             vfs_write+0x12c/0x1a4
             ksys_write+0xb0/0x120
             __arm64_sys_write+0x54/0x88
             el0_svc_handler+0x170/0x240
             el0_sync_handler+0x150/0x250
             el0_sync+0x164/0x180
      
             other info that might help us debug this:
      
       Chain exists of:
         &(&zone->lock)->rlock --> &(&port->lock)->rlock --> &pool->lock/1
      
       Possible unsafe locking scenario:
      
             CPU0                    CPU1
             ----                    ----
        lock(&pool->lock/1);
                                     lock(&(&port->lock)->rlock);
                                     lock(&pool->lock/1);
        lock(&(&zone->lock)->rlock);
      
                      *** DEADLOCK ***
      
      4 locks held by doio/49441:
       #0: a0ff00886fc27408 (sb_writers#8){.+.+}, at: vfs_write+0x118/0x1a4
       #1: 8fff00080810dfe0 (&xfs_nondir_ilock_class){++++}, at:
      xfs_ilock+0x2a8/0x300 [xfs]
       #2: ffff9000129f2390 (rcu_read_lock){....}, at:
      rcu_lock_acquire+0x8/0x38
       #3: 60ff000822352818 (&pool->lock/1){-.-.}, at:
      start_flush_work+0xd8/0x3f0
      
                     stack backtrace:
      CPU: 48 PID: 49441 Comm: doio Tainted: G        W
      Hardware name: HPE Apollo 70             /C01_APACHE_MB         , BIOS
      L50_5.13_1.11 06/18/2019
      Call trace:
       dump_backtrace+0x0/0x248
       show_stack+0x20/0x2c
       dump_stack+0xe8/0x150
       print_circular_bug+0x368/0x380
       check_noncircular+0x28c/0x294
       validate_chain+0xf6c/0x2e2c
       __lock_acquire+0x868/0xc2c
       lock_acquire+0x320/0x360
       _raw_spin_lock+0x64/0x80
       rmqueue+0x138/0x2050
       get_page_from_freelist+0x474/0x688
       __alloc_pages_nodemask+0x3b4/0x18dc
       alloc_pages_current+0xd0/0xe0
       alloc_slab_page+0x2b4/0x5e0
       new_slab+0xc8/0x6bc
       ___slab_alloc+0x3b8/0x640
       kmem_cache_alloc+0x4b4/0x588
       __debug_object_init+0x778/0x8b4
       debug_object_init_on_stack+0x40/0x50
       start_flush_work+0x16c/0x3f0
       __flush_work+0xb8/0x124
       flush_work+0x20/0x30
       xlog_cil_force_lsn+0x88/0x204 [xfs]
       xfs_log_force_lsn+0x128/0x1b8 [xfs]
       xfs_file_fsync+0x3c4/0x488 [xfs]
       vfs_fsync_range+0xb0/0xd0
       generic_write_sync+0x80/0xa0 [xfs]
       xfs_file_buffered_aio_write+0x66c/0x6e4 [xfs]
       xfs_file_write_iter+0x1a0/0x218 [xfs]
       __vfs_write+0x1cc/0x214
       vfs_write+0x12c/0x1a4
       ksys_write+0xb0/0x120
       __arm64_sys_write+0x54/0x88
       el0_svc_handler+0x170/0x240
       el0_sync_handler+0x150/0x250
       el0_sync+0x164/0x180
      Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
      Signed-off-by: Qian Cai <cai@lca.pw>
      Link: https://lore.kernel.org/r/1573679785-21068-1-git-send-email-cai@lca.pw
      Signed-off-by: Theodore Ts'o <tytso@mit.edu>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      15341b1d
    • Jacob Pan's avatar
      iommu/vt-d: Fix off-by-one in PASID allocation · 4802b257
      Jacob Pan authored
      [ Upstream commit 39d630e3 ]
      
      PASID allocator uses IDR which is exclusive for the end of the
      allocation range. There is no need to decrement pasid_max.
      
      Fixes: af395073 ("iommu/vt-d: Apply global PASID in SVA")
      Reported-by: Eric Auger <eric.auger@redhat.com>
      Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
      Reviewed-by: Eric Auger <eric.auger@redhat.com>
      Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
      Signed-off-by: Joerg Roedel <jroedel@suse.de>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      4802b257
    • Jia-Ju Bai's avatar
      gpio: gpio-grgpio: fix possible sleep-in-atomic-context bugs in grgpio_irq_map/unmap() · 442b50c0
      Jia-Ju Bai authored
      [ Upstream commit e36eaf94 ]
      
      The driver may sleep while holding a spinlock.
      The function call path (from bottom to top) in Linux 4.19 is:
      
      drivers/gpio/gpio-grgpio.c, 261:
      	request_irq in grgpio_irq_map
      drivers/gpio/gpio-grgpio.c, 255:
      	_raw_spin_lock_irqsave in grgpio_irq_map
      
      drivers/gpio/gpio-grgpio.c, 318:
      	free_irq in grgpio_irq_unmap
      drivers/gpio/gpio-grgpio.c, 299:
      	_raw_spin_lock_irqsave in grgpio_irq_unmap
      
      request_irq() and free_irq() can sleep at runtime.
      
      To fix these bugs, request_irq() and free_irq() are called without
      holding the spinlock.
      
      These bugs are found by a static analysis tool STCheck written by myself.
      Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
      Link: https://lore.kernel.org/r/20191218132605.10594-1-baijiaju1990@gmail.com
      Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      442b50c0
    • Oliver O'Halloran's avatar
      powerpc/powernv/iov: Ensure the pdn for VFs always contains a valid PE number · 67f7f0c7
      Oliver O'Halloran authored
      [ Upstream commit 3b5b9997 ]
      
      On pseries there is a bug with adding hotplugged devices to an IOMMU
      group. For a number of dumb reasons fixing that bug first requires
      re-working how VFs are configured on PowerNV. For background, on
      PowerNV we use the pcibios_sriov_enable() hook to do two things:
      
        1. Create a pci_dn structure for each of the VFs, and
        2. Configure the PHB's internal BARs so the MMIO range for each VF
           maps to a unique PE.
      
      Roughly speaking a PE is the hardware counterpart to a Linux IOMMU
      group since all the devices in a PE share the same IOMMU table. A PE
      also defines the set of devices that should be isolated in response to
      a PCI error (i.e. bad DMA, UR/CA, AER events, etc). When isolated all
      MMIO and DMA traffic to and from devices in the PE is blocked by the
      root complex until the PE is recovered by the OS.
      
      The requirement to block MMIO causes a giant headache because the P8
      PHB generally uses a fixed mapping between MMIO addresses and PEs. As
      a result we need to delay configuring the IOMMU groups for device
      until after MMIO resources are assigned. For physical devices (i.e.
      non-VFs) the PE assignment is done in pcibios_setup_bridge() which is
      called immediately after the MMIO resources for downstream
      devices (and the bridge's windows) are assigned. For VFs the setup is
      more complicated because:
      
        a) pcibios_setup_bridge() is not called again when VFs are activated, and
        b) The pci_dev for VFs are created by generic code which runs after
           pcibios_sriov_enable() is called.
      
      The work around for this is a two step process:
      
        1. A fixup in pcibios_add_device() is used to initialise the cached
           pe_number in pci_dn, then
        2. A bus notifier then adds the device to the IOMMU group for the PE
           specified in pci_dn->pe_number.
      
      A side effect of fixing the pseries bug mentioned in the first paragraph
      is moving the fixup out of pcibios_add_device() and into
      pcibios_bus_add_device(), which is called much later. This results in
      step 2. failing because pci_dn->pe_number won't be initialised when
      the bus notifier is run.
      
      We can fix this by removing the need for the fixup. The PE for a VF is
      known before the VF is even scanned so we can initialise
      pci_dn->pe_number in pcibios_sriov_enable() instead. Unfortunately,
      moving the initialisation causes two problems:
      
        1. We trip the WARN_ON() in the current fixup code, and
        2. The EEH core clears pdn->pe_number when recovering a VF and
           relies on the fixup to correctly re-set it.
      
      The only justification for either of these is a comment in
      eeh_rmv_device() suggesting that pdn->pe_number *must* be set to
      IODA_INVALID_PE in order for the VF to be scanned. However, this
      comment appears to have no basis in reality. Both bugs can be fixed by
      just deleting the code.
      Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
      Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
      Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
      Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
      Link: https://lore.kernel.org/r/20191028085424.12006-1-oohall@gmail.com
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      67f7f0c7
    • Eugen Hristev's avatar
      media: i2c: mt9v032: fix enum mbus codes and frame sizes · 03ac6ed4
      Eugen Hristev authored
      [ Upstream commit 1451d5ae ]
      
      This driver supports both the mt9v032 (color) and the mt9v022 (mono)
      sensors. Depending on which sensor is used, the format from the sensor is
      different. The format.code inside the dev struct holds this information.
      The enum mbus and enum frame sizes need to take into account both types of
      sensors, not just the color one. To solve this, use the format.code in
      these functions instead of the hardcoded bayer color format (which is only
      used for mt9v032).
      
      [Sakari Ailus: rewrapped commit message]
      Suggested-by: Wenyou Yang <wenyou.yang@microchip.com>
      Signed-off-by: Eugen Hristev <eugen.hristev@microchip.com>
      Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
      Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
      Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      03ac6ed4
    • Christophe JAILLET's avatar
      pxa168fb: Fix the function used to release some memory in an error handling path · 8cc5aa5c
      Christophe JAILLET authored
      [ Upstream commit 3c911fe7 ]
      
      In the probe function, some resources are allocated using 'dma_alloc_wc()',
      they should be released with 'dma_free_wc()', not 'dma_free_coherent()'.
      
      We already use 'dma_free_wc()' in the remove function, but not in the
      error handling path of the probe function.
      
      Also, remove a useless 'PAGE_ALIGN()'. 'info->fix.smem_len' is already
      PAGE_ALIGNed.
      
      Fixes: 638772c7 ("fb: add support of LCD display controller on pxa168/910 (base layer)")
      Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
      Reviewed-by: Lubomir Rintel <lkundrak@v3.sk>
      CC: YueHaibing <yuehaibing@huawei.com>
      Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
      Link: https://patchwork.freedesktop.org/patch/msgid/20190831100024.3248-1-christophe.jaillet@wanadoo.fr
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      8cc5aa5c
    • Geert Uytterhoeven's avatar
      pinctrl: sh-pfc: sh7264: Fix CAN function GPIOs · e5c8d49b
      Geert Uytterhoeven authored
      [ Upstream commit 55b1cb1f ]
      
      pinmux_func_gpios[] contains a hole due to the missing function GPIO
      definition for the "CTX0&CTX1" signal, which is the logical "AND" of the
      two CAN outputs.
      
      Fix this by:
        - Renaming CRX0_CRX1_MARK to CTX0_CTX1_MARK, as PJ2MD[2:0]=010
          configures the combined "CTX0&CTX1" output signal,
        - Renaming CRX0X1_MARK to CRX0_CRX1_MARK, as PJ3MD[1:0]=10 configures
          the shared "CRX0/CRX1" input signal, which is fed to both CAN
          inputs,
        - Adding the missing function GPIO definition for "CTX0&CTX1" to
          pinmux_func_gpios[],
        - Moving all CAN enums next to each other.
      
      See SH7262 Group, SH7264 Group User's Manual: Hardware, Rev. 4.00:
        [1] Figure 1.2 (3) Pin Assignment for the SH7264 Group (1-Mbyte
            Version),
        [2] Figure 1.2 (4) Pin Assignment for the SH7264 Group (640-Kbyte
            Version),
        [3] Table 1.4 List of Pins,
        [4] Figure 20.29 Connection Example when Using This Module as 1-Channel
            Module (64 Mailboxes x 1 Channel),
        [5] Table 32.10 Multiplexed Pins (Port J),
        [6] Section 32.2.30 (3) Port J Control Register 0 (PJCR0).
      
      Note that the last 2 disagree about PJ2MD[2:0], which is probably the
      root cause of this bug.  But considering [4], "CTx0&CTx1" in [5] must
      be correct, and "CRx0&CRx1" in [6] must be wrong.
      Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
      Link: https://lore.kernel.org/r/20191218194812.12741-4-geert+renesas@glider.be
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      e5c8d49b
    • Vladimir Oltean's avatar
      gianfar: Fix TX timestamping with a stacked DSA driver · 195e54e6
      Vladimir Oltean authored
      [ Upstream commit c26a2c2d ]
      
      The driver wrongly assumes that it is the only entity that can set the
      SKBTX_IN_PROGRESS bit of the current skb. Therefore, in the
      gfar_clean_tx_ring function, where the TX timestamp is collected if
      necessary, the aforementioned bit is used to discriminate whether or not
      the TX timestamp should be delivered to the socket's error queue.
      
      But a stacked driver such as a DSA switch can also set the
      SKBTX_IN_PROGRESS bit, which is actually exactly what it should do in
      order to denote that the hardware timestamping process is under way.
      
      Therefore, gianfar would misinterpret the "in progress" bit as being its
      own, and deliver a second skb clone in the socket's error queue,
      completely throwing off a PTP process which is not expecting to receive
      it, _even though_ TX timestamping is not enabled for gianfar.
      
      There have been discussions [0] as to whether non-MAC drivers need or
      not to set SKBTX_IN_PROGRESS at all (whose purpose is to avoid sending 2
      timestamps, a sw and a hw one, to applications which only expect one).
      But as of this patch, there are at least 2 PTP drivers that would break
      in conjunction with gianfar: the sja1105 DSA switch and the felix
      switch, by way of its ocelot core driver.
      
      So regardless of that conclusion, fix the gianfar driver to not do stuff
      based on flags set by others and not intended for it.
      
      [0]: https://www.spinics.net/lists/netdev/msg619699.html
      
      Fixes: f0ee7acf ("gianfar: Add hardware TX timestamping support")
      Signed-off-by: Vladimir Oltean <olteanv@gmail.com>
      Acked-by: Richard Cochran <richardcochran@gmail.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      195e54e6
    • Takashi Sakamoto's avatar
      ALSA: ctl: allow TLV read operation for callback type of element in locked case · 2dbae70b
      Takashi Sakamoto authored
      [ Upstream commit d61fe22c ]
      
      A design of ALSA control core allows applications to execute three
      operations for TLV feature; read, write and command. Furthermore, it
      allows driver developers to process the operations by two ways; allocated
      array or callback function. In the former, read operation is just allowed,
      thus developers use the latter when device driver supports variety of
      models or the target model is expected to dynamically change information
      stored in TLV container.
      
      The core also allows applications to lock any element so that the other
      applications can't perform write operation to the element for element
      value and TLV information. When the element is locked, write and command
      operation for TLV information are prohibited as well as element value.
      Any read operation should be allowed in the case.
      
      At present, when an element has callback function for TLV information,
      TLV read operation returns EPERM if the element is locked. On the
      other hand, the read operation succeeds when an element has an allocated
      array for TLV information. In both cases, the read operation succeeds for
      the element value, as expected.
      
      This commit fixes the bug. This change can be backported to v4.14
      kernel or later.
      Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
      Reviewed-by: Jaroslav Kysela <perex@perex.cz>
      Link: https://lore.kernel.org/r/20191223093347.15279-1-o-takashi@sakamocchi.jp
      Signed-off-by: Takashi Iwai <tiwai@suse.de>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      2dbae70b
    • Ritesh Harjani's avatar
      ext4: fix ext4_dax_read/write inode locking sequence for IOCB_NOWAIT · 428bb08a
      Ritesh Harjani authored
      [ Upstream commit f629afe3 ]
      
      Apparently our current rwsem code doesn't like doing the trylock, then
      lock for real scheme.  So change our dax read/write methods to just do the
      trylock for the RWF_NOWAIT case.
      This seems to fix AIM7 regression in some scalable filesystems up to ~25%
      in some cases. Claimed in commit 942491c9 ("xfs: fix AIM7 regression")
      Reviewed-by: Jan Kara <jack@suse.cz>
      Reviewed-by: Matthew Bobrowski <mbobrowski@mbobrowski.org>
      Tested-by: Joseph Qi <joseph.qi@linux.alibaba.com>
      Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
      Link: https://lore.kernel.org/r/20191212055557.11151-2-riteshh@linux.ibm.com
      Signed-off-by: Theodore Ts'o <tytso@mit.edu>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      428bb08a
    • Zahari Petkov's avatar
      leds: pca963x: Fix open-drain initialization · 44d748f2
      Zahari Petkov authored
      [ Upstream commit 69752909 ]
      
      Before commit bb29b9cc ("leds: pca963x: Add bindings to invert
      polarity") Mode register 2 was initialized directly with either 0x01
      or 0x05 for open-drain or totem pole (push-pull) configuration.
      
      Afterwards, MODE2 initialization started using bitwise operations on
      top of the default MODE2 register value (0x05). Using bitwise OR for
      setting OUTDRV with 0x01 and 0x05 does not produce correct results.
      When open-drain is used, instead of setting OUTDRV to 0, the driver
      keeps it as 1:
      
      Open-drain: 0x05 | 0x01 -> 0x05 (0b101 - incorrect)
      Totem pole: 0x05 | 0x05 -> 0x05 (0b101 - correct but still wrong)
      
      Now OUTDRV setting uses correct bitwise operations for initialization:
      
      Open-drain: 0x05 & ~0x04 -> 0x01 (0b001 - correct)
      Totem pole: 0x05 | 0x04 -> 0x05 (0b101 - correct)
      
      Additional MODE2 register definitions are introduced now as well.
      
      Fixes: bb29b9cc ("leds: pca963x: Add bindings to invert polarity")
      Signed-off-by: Zahari Petkov <zahari@balena.io>
      Signed-off-by: Pavel Machek <pavel@ucw.cz>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      44d748f2
    • Dan Carpenter's avatar
      brcmfmac: Fix use after free in brcmf_sdio_readframes() · ead1cee8
      Dan Carpenter authored
      [ Upstream commit 216b4400 ]
      
      The brcmu_pkt_buf_free_skb() function frees "pkt" so it leads to a
      static checker warning:
      
          drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c:1974 brcmf_sdio_readframes()
          error: dereferencing freed memory 'pkt'
      
      It looks like there was supposed to be a continue after we free "pkt".
      
      Fixes: 4754fcee ("brcmfmac: streamline SDIO read frame routine")
      Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
      Acked-by: Franky Lin <franky.lin@broadcom.com>
      Signed-off-by: Kalle Valo <kvalo@codeaurora.org>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      ead1cee8
    • Peter Zijlstra's avatar
      cpu/hotplug, stop_machine: Fix stop_machine vs hotplug order · b9dc4d61
      Peter Zijlstra authored
      [ Upstream commit 45178ac0 ]
      
      Paul reported a very sporadic, rcutorture induced, workqueue failure.
      When the planets align, the workqueue rescuer's self-migrate fails and
      then triggers a WARN for running a work on the wrong CPU.
      
      Tejun then figured that set_cpus_allowed_ptr()'s stop_one_cpu() call
      could be ignored! When stopper->enabled is false, stop_machine will
      insta complete the work, without actually doing the work. Worse, it
      will not WARN about this (we really should fix this).
      
      It turns out there is a small window where a freshly online'ed CPU is
      marked 'online' but doesn't yet have the stopper task running:
      
      	BP				AP
      
      	bringup_cpu()
      	  __cpu_up(cpu, idle)	 -->	start_secondary()
      					...
      					cpu_startup_entry()
      	  bringup_wait_for_ap()
      	    wait_for_ap_thread() <--	  cpuhp_online_idle()
      					  while (1)
      					    do_idle()
      
      					... available to run kthreads ...
      
      	    stop_machine_unpark()
      	      stopper->enabled = true;
      
      Close this by moving the stop_machine_unpark() into
      cpuhp_online_idle(), such that the stopper thread is ready before we
      start the idle loop and schedule.
      Reported-by: "Paul E. McKenney" <paulmck@kernel.org>
      Debugged-by: Tejun Heo <tj@kernel.org>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Tested-by: "Paul E. McKenney" <paulmck@kernel.org>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      b9dc4d61
    • Paul Kocialkowski's avatar
      drm/gma500: Fixup fbdev stolen size usage evaluation · 5d358e7e
      Paul Kocialkowski authored
      [ Upstream commit fd1a5e52 ]
      
      psbfb_probe performs an evaluation of the required size from the stolen
      GTT memory, but gets it wrong in two distinct ways:
      - The resulting size must be page-size-aligned;
      - The size to allocate is derived from the surface dimensions, not the fb
        dimensions.
      
      When two connectors are connected with different modes, the smallest will
      be stored in the fb dimensions, but the size that needs to be allocated must
      match the largest (surface) dimensions. This is what is used in the actual
      allocation code.
      
      Fix this by correcting the evaluation to conform to the two points above.
      It allows correctly switching to 16bpp when one connector is e.g. 1920x1080
      and the other is 1024x768.
      Signed-off-by: Paul Kocialkowski <paul.kocialkowski@bootlin.com>
      Signed-off-by: Patrik Jakobsson <patrik.r.jakobsson@gmail.com>
      Link: https://patchwork.freedesktop.org/patch/msgid/20191107153048.843881-1-paul.kocialkowski@bootlin.com
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      5d358e7e
    • Sean Christopherson's avatar
      KVM: nVMX: Use correct root level for nested EPT shadow page tables · 2130de7d
      Sean Christopherson authored
      [ Upstream commit 148d735e ]
      
      Hardcode the EPT page-walk level for L2 to be 4 levels, as KVM's MMU
      currently also hardcodes the page walk level for nested EPT to be 4
      levels.  The L2 guest is all but guaranteed to soft hang on its first
      instruction when L1 is using EPT, as KVM will construct 4-level page
      tables and then tell hardware to use 5-level page tables.
      
      Fixes: 855feb67 ("KVM: MMU: Add 5 level EPT & Shadow page table support.")
      Cc: stable@vger.kernel.org
      Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
      Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      2130de7d
    • Sasha Levin's avatar
      Revert "KVM: VMX: Add non-canonical check on writes to RTIT address MSRs" · 9c270ce3
      Sasha Levin authored
      This reverts commit 57211b73.
      
      This patch isn't needed on 4.19 and older.
      Signed-off-by: Sasha Levin <sashal@kernel.org>
      9c270ce3
    • Sasha Levin's avatar
      249387d7
    • Davide Caratti's avatar
      net/sched: flower: add missing validation of TCA_FLOWER_FLAGS · e2eb6f22
      Davide Caratti authored
      [ Upstream commit e2debf08 ]
      
      unlike other classifiers that can be offloaded (i.e. users can set flags
      like 'skip_hw' and 'skip_sw'), 'cls_flower' doesn't validate the size of
      netlink attribute 'TCA_FLOWER_FLAGS' provided by user: add a proper entry
      to fl_policy.
      
      Fixes: 5b33f488 ("net/flower: Introduce hardware offload support")
      Signed-off-by: default avatarDavide Caratti <dcaratti@redhat.com>
      Acked-by: default avatarJiri Pirko <jiri@mellanox.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      e2eb6f22
    • Davide Caratti's avatar
      net/sched: matchall: add missing validation of TCA_MATCHALL_FLAGS · 6752ae60
      Davide Caratti authored
      [ Upstream commit 1afa3cc9 ]
      
      unlike other classifiers that can be offloaded (i.e. users can set flags
      like 'skip_hw' and 'skip_sw'), 'cls_matchall' doesn't validate the size
      of netlink attribute 'TCA_MATCHALL_FLAGS' provided by user: add a proper
      entry to mall_policy.
      
      Fixes: b87f7936 ("net/sched: Add match-all classifier hw offloading.")
      Signed-off-by: default avatarDavide Caratti <dcaratti@redhat.com>
      Acked-by: default avatarJiri Pirko <jiri@mellanox.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      6752ae60
    • Per Forlin's avatar
      net: dsa: tag_qca: Make sure there is headroom for tag · d1e0f10e
      Per Forlin authored
      [ Upstream commit 04fb9124 ]
      
      Passing tag size to skb_cow_head will make sure
      there is enough headroom for the tag data.
      This change does not introduce any overhead in case there
      is already available headroom for tag.
      Signed-off-by: default avatarPer Forlin <perfn@axis.com>
      Reviewed-by: default avatarFlorian Fainelli <f.fainelli@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      d1e0f10e
    • Eric Dumazet's avatar
      net/smc: fix leak of kernel memory to user space · 421ab411
      Eric Dumazet authored
      [ Upstream commit 457fed77 ]
      
      As nlmsg_put() does not clear the memory that is reserved,
      it is the caller's responsibility to make sure all of this
      memory will be written, in order to not reveal prior content.
      
      While we are at it, we can provide the socket cookie even
      if clsock is not set.
      
      syzbot reported :
      
      BUG: KMSAN: uninit-value in __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline]
      BUG: KMSAN: uninit-value in __fswab32 include/uapi/linux/swab.h:59 [inline]
      BUG: KMSAN: uninit-value in __swab32p include/uapi/linux/swab.h:179 [inline]
      BUG: KMSAN: uninit-value in __be32_to_cpup include/uapi/linux/byteorder/little_endian.h:82 [inline]
      BUG: KMSAN: uninit-value in get_unaligned_be32 include/linux/unaligned/access_ok.h:30 [inline]
      BUG: KMSAN: uninit-value in ____bpf_skb_load_helper_32 net/core/filter.c:240 [inline]
      BUG: KMSAN: uninit-value in ____bpf_skb_load_helper_32_no_cache net/core/filter.c:255 [inline]
      BUG: KMSAN: uninit-value in bpf_skb_load_helper_32_no_cache+0x14a/0x390 net/core/filter.c:252
      CPU: 1 PID: 5262 Comm: syz-executor.5 Not tainted 5.5.0-rc5-syzkaller #0
      Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
      Call Trace:
       __dump_stack lib/dump_stack.c:77 [inline]
       dump_stack+0x1c9/0x220 lib/dump_stack.c:118
       kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:118
       __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215
       __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline]
       __fswab32 include/uapi/linux/swab.h:59 [inline]
       __swab32p include/uapi/linux/swab.h:179 [inline]
       __be32_to_cpup include/uapi/linux/byteorder/little_endian.h:82 [inline]
       get_unaligned_be32 include/linux/unaligned/access_ok.h:30 [inline]
       ____bpf_skb_load_helper_32 net/core/filter.c:240 [inline]
       ____bpf_skb_load_helper_32_no_cache net/core/filter.c:255 [inline]
       bpf_skb_load_helper_32_no_cache+0x14a/0x390 net/core/filter.c:252
      
      Uninit was created at:
       kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline]
       kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127
       kmsan_kmalloc_large+0x73/0xc0 mm/kmsan/kmsan_hooks.c:128
       kmalloc_large_node_hook mm/slub.c:1406 [inline]
       kmalloc_large_node+0x282/0x2c0 mm/slub.c:3841
       __kmalloc_node_track_caller+0x44b/0x1200 mm/slub.c:4368
       __kmalloc_reserve net/core/skbuff.c:141 [inline]
       __alloc_skb+0x2fd/0xac0 net/core/skbuff.c:209
       alloc_skb include/linux/skbuff.h:1049 [inline]
       netlink_dump+0x44b/0x1ab0 net/netlink/af_netlink.c:2224
       __netlink_dump_start+0xbb2/0xcf0 net/netlink/af_netlink.c:2352
       netlink_dump_start include/linux/netlink.h:233 [inline]
       smc_diag_handler_dump+0x2ba/0x300 net/smc/smc_diag.c:242
       sock_diag_rcv_msg+0x211/0x610 net/core/sock_diag.c:256
       netlink_rcv_skb+0x451/0x650 net/netlink/af_netlink.c:2477
       sock_diag_rcv+0x63/0x80 net/core/sock_diag.c:275
       netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
       netlink_unicast+0xf9e/0x1100 net/netlink/af_netlink.c:1328
       netlink_sendmsg+0x1248/0x14d0 net/netlink/af_netlink.c:1917
       sock_sendmsg_nosec net/socket.c:639 [inline]
       sock_sendmsg net/socket.c:659 [inline]
       kernel_sendmsg+0x433/0x440 net/socket.c:679
       sock_no_sendpage+0x235/0x300 net/core/sock.c:2740
       kernel_sendpage net/socket.c:3776 [inline]
       sock_sendpage+0x1e1/0x2c0 net/socket.c:937
       pipe_to_sendpage+0x38c/0x4c0 fs/splice.c:458
       splice_from_pipe_feed fs/splice.c:512 [inline]
       __splice_from_pipe+0x539/0xed0 fs/splice.c:636
       splice_from_pipe fs/splice.c:671 [inline]
       generic_splice_sendpage+0x1d5/0x2d0 fs/splice.c:844
       do_splice_from fs/splice.c:863 [inline]
       do_splice fs/splice.c:1170 [inline]
       __do_sys_splice fs/splice.c:1447 [inline]
       __se_sys_splice+0x2380/0x3350 fs/splice.c:1427
       __x64_sys_splice+0x6e/0x90 fs/splice.c:1427
       do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:296
       entry_SYSCALL_64_after_hwframe+0x44/0xa9
      
      Fixes: f16a7dd5 ("smc: netlink interface for SMC sockets")
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      421ab411
    • Firo Yang's avatar
      enic: prevent waking up stopped tx queues over watchdog reset · 150f8c56
      Firo Yang authored
      [ Upstream commit 0f905225 ]
      
      In recent months, our customer reported several kernel crashes, all
      preceded by the following message:
      NETDEV WATCHDOG: eth2 (enic): transmit queue 0 timed out
      Error message of one of those crashes:
      BUG: unable to handle kernel paging request at ffffffffa007e090
      
      After analyzing several vmcores, I found that most of the crashes are
      caused by memory corruption. And all the corrupted memory areas
      are overwritten by data of network packets. Moreover, I also found
      that the tx queues were enabled over watchdog reset.
      
      After going through the source code, I found that in enic_stop(),
      the tx queues stopped by netif_tx_disable() could be woken up over
      a small time window between netif_tx_disable() and the
      napi_disable() by the following code path:
      napi_poll->
        enic_poll_msix_wq->
           vnic_cq_service->
              enic_wq_service->
                 netif_wake_subqueue(enic->netdev, q_number)->
                    test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)
      In turn, upper network stack could queue skb to ENIC NIC through
      enic_hard_start_xmit(). And this might introduce some race condition.
      
      Our customer confirmed that this kind of kernel crash hasn't occurred in
      over 90 days since they applied this patch.
      Signed-off-by: default avatarFiro Yang <firo.yang@suse.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      150f8c56
    • Toke Høiland-Jørgensen's avatar
      core: Don't skip generic XDP program execution for cloned SKBs · ce754a31
      Toke Høiland-Jørgensen authored
      [ Upstream commit ad1e03b2 ]
      
      The current generic XDP handler skips execution of XDP programs entirely if
      an SKB is marked as cloned. This leads to some surprising behaviour, as
      packets can end up being cloned in various ways, which will make an XDP
      program not see all the traffic on an interface.
      
      This was discovered by a simple test case where an XDP program that always
      returns XDP_DROP is installed on a veth device. When combining this with
      the Scapy packet sniffer (which uses an AF_PACKET socket) on the sending
      side, SKBs reliably end up in the cloned state, causing them to be passed
      through to the receiving interface instead of being dropped. A minimal
      reproducer script for this is included below.
      
      This patch fixed the issue by simply triggering the existing linearisation
      code for cloned SKBs instead of skipping the XDP program execution. This
      behaviour is in line with the behaviour of the native XDP implementation
      for the veth driver, which will reallocate and copy the SKB data if the SKB
      is marked as shared.
      
      Reproducer Python script (requires BCC and Scapy):
      
      from scapy.all import TCP, IP, Ether, sendp, sniff, AsyncSniffer, Raw, UDP
      from bcc import BPF
      import time, sys, subprocess, shlex
      
      SKB_MODE = (1 << 1)
      DRV_MODE = (1 << 2)
      PYTHON=sys.executable
      
      def client():
          time.sleep(2)
          # Sniffing on the sender causes skb_cloned() to be set
          s = AsyncSniffer()
          s.start()
      
          for p in range(10):
              sendp(Ether(dst="aa:aa:aa:aa:aa:aa", src="cc:cc:cc:cc:cc:cc")/IP()/UDP()/Raw("Test"),
                    verbose=False)
              time.sleep(0.1)
      
          s.stop()
          return 0
      
      def server(mode):
          prog = BPF(text="int dummy_drop(struct xdp_md *ctx) {return XDP_DROP;}")
          func = prog.load_func("dummy_drop", BPF.XDP)
          prog.attach_xdp("a_to_b", func, mode)
      
          time.sleep(1)
      
          s = sniff(iface="a_to_b", count=10, timeout=15)
          if len(s):
              print(f"Got {len(s)} packets - should have gotten 0")
              return 1
          else:
              print("Got no packets - as expected")
              return 0
      
      if len(sys.argv) < 2:
          print(f"Usage: {sys.argv[0]} <skb|drv>")
          sys.exit(1)
      
      if sys.argv[1] == "client":
          sys.exit(client())
      elif sys.argv[1] == "server":
          mode = SKB_MODE if sys.argv[2] == 'skb' else DRV_MODE
          sys.exit(server(mode))
      else:
          try:
              mode = sys.argv[1]
              if mode not in ('skb', 'drv'):
                  print(f"Usage: {sys.argv[0]} <skb|drv>")
                  sys.exit(1)
              print(f"Running in {mode} mode")
      
              for cmd in [
                      'ip netns add netns_a',
                      'ip netns add netns_b',
                      'ip -n netns_a link add a_to_b type veth peer name b_to_a netns netns_b',
                      # Disable ipv6 to make sure there's no address autoconf traffic
                      'ip netns exec netns_a sysctl -qw net.ipv6.conf.a_to_b.disable_ipv6=1',
                      'ip netns exec netns_b sysctl -qw net.ipv6.conf.b_to_a.disable_ipv6=1',
                      'ip -n netns_a link set dev a_to_b address aa:aa:aa:aa:aa:aa',
                      'ip -n netns_b link set dev b_to_a address cc:cc:cc:cc:cc:cc',
                      'ip -n netns_a link set dev a_to_b up',
                      'ip -n netns_b link set dev b_to_a up']:
                  subprocess.check_call(shlex.split(cmd))
      
              server = subprocess.Popen(shlex.split(f"ip netns exec netns_a {PYTHON} {sys.argv[0]} server {mode}"))
              client = subprocess.Popen(shlex.split(f"ip netns exec netns_b {PYTHON} {sys.argv[0]} client"))
      
              client.wait()
              server.wait()
              sys.exit(server.returncode)
      
          finally:
              subprocess.run(shlex.split("ip netns delete netns_a"))
              subprocess.run(shlex.split("ip netns delete netns_b"))
      
      Fixes: d4455169 ("net: xdp: support xdp generic on virtual devices")
      Reported-by: default avatarStepan Horacek <shoracek@redhat.com>
      Suggested-by: default avatarPaolo Abeni <pabeni@redhat.com>
      Signed-off-by: default avatarToke Høiland-Jørgensen <toke@redhat.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      ce754a31
  2. 19 Feb, 2020 17 commits
    • Greg Kroah-Hartman's avatar
      Linux 4.19.105 · 4fccc250
      Greg Kroah-Hartman authored
      4fccc250
    • Sean Christopherson's avatar
      KVM: x86/mmu: Fix struct guest_walker arrays for 5-level paging · e39cc4b0
      Sean Christopherson authored
      [ Upstream commit f6ab0107 ]
      
      Define PT_MAX_FULL_LEVELS as PT64_ROOT_MAX_LEVEL, i.e. 5, to fix shadow
      paging for 5-level guest page tables.  PT_MAX_FULL_LEVELS is used to
      size the arrays that track guest pages table information, i.e. using a
      "max levels" of 4 causes KVM to access garbage beyond the end of an
      array when querying state for level 5 entries.  E.g. FNAME(gpte_changed)
      will read garbage and most likely return %true for a level 5 entry,
      soft-hanging the guest because FNAME(fetch) will restart the guest
      instead of creating SPTEs because it thinks the guest PTE has changed.
      
      Note, KVM doesn't yet support 5-level nested EPT, so PT_MAX_FULL_LEVELS
      gets to stay "4" for the PTTYPE_EPT case.
      
      Fixes: 855feb67 ("KVM: MMU: Add 5 level EPT & Shadow page table support.")
      Cc: stable@vger.kernel.org
      Signed-off-by: default avatarSean Christopherson <sean.j.christopherson@intel.com>
      Signed-off-by: default avatarPaolo Bonzini <pbonzini@redhat.com>
      Signed-off-by: default avatarSasha Levin <sashal@kernel.org>
      e39cc4b0
    • zhangyi (F)'s avatar
      jbd2: do not clear the BH_Mapped flag when forgetting a metadata buffer · 2a3cf355
      zhangyi (F) authored
      [ Upstream commit c96dceea ]
      
      Commit 904cdbd4 ("jbd2: clear dirty flag when revoking a buffer from
      an older transaction") set the BH_Freed flag when forgetting a metadata
      buffer which belongs to the committing transaction; it indicates that the
      committing process should clear the dirty bits when it is done with the
      buffer. But it also clears the BH_Mapped flag at the same time, which may
      trigger the NULL pointer oops below when block_size < PAGE_SIZE.
      
      rmdir 1             kjournald2                 mkdir 2
                          jbd2_journal_commit_transaction
      		    commit transaction N
      jbd2_journal_forget
      set_buffer_freed(bh1)
                          jbd2_journal_commit_transaction
                           commit transaction N+1
                           ...
                           clear_buffer_mapped(bh1)
                                                     ext4_getblk(bh2 unmapped)
                                                     ...
                                                     grow_dev_page
                                                      init_page_buffers
                                                       bh1->b_private=NULL
                                                       bh2->b_private=NULL
                           jbd2_journal_put_journal_head(jh1)
                            __journal_remove_journal_head(hb1)
      		       jh1 is NULL and trigger oops
      
      *) Dir entry block bh1 and bh2 belongs to one page, and the bh2 has
         already been unmapped.
      
      For the metadata buffers we are forgetting, we should always keep the
      mapped flag; clearing the dirty flags is enough. So this patch picks out
      these buffers and keeps their BH_Mapped flag.
      
      Link: https://lore.kernel.org/r/20200213063821.30455-3-yi.zhang@huawei.com
      Fixes: 904cdbd4 ("jbd2: clear dirty flag when revoking a buffer from an older transaction")
      Reviewed-by: default avatarJan Kara <jack@suse.cz>
      Signed-off-by: default avatarzhangyi (F) <yi.zhang@huawei.com>
      Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
      Cc: stable@kernel.org
      Signed-off-by: default avatarSasha Levin <sashal@kernel.org>
      2a3cf355
    • zhangyi (F)'s avatar
      jbd2: move the clearing of b_modified flag to the journal_unmap_buffer() · 056c7c22
      zhangyi (F) authored
      [ Upstream commit 6a66a7de ]
      
      There is no need to delay the clearing of b_modified flag to the
      transaction committing time when unmapping the journalled buffer, so
      just move it to the journal_unmap_buffer().
      
      Link: https://lore.kernel.org/r/20200213063821.30455-2-yi.zhang@huawei.com
      Reviewed-by: default avatarJan Kara <jack@suse.cz>
      Signed-off-by: default avatarzhangyi (F) <yi.zhang@huawei.com>
      Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
      Cc: stable@kernel.org
      Signed-off-by: default avatarSasha Levin <sashal@kernel.org>
      056c7c22
    • Olga Kornievskaia's avatar
      NFSv4.1 make cachethis=no for writes · 32865d65
      Olga Kornievskaia authored
      commit cd1b659d upstream.
      
      Turning caching off for writes on the server should improve performance.
      
      Fixes: fba83f34 ("NFS: Pass "privileged" value to nfs4_init_sequence()")
      Signed-off-by: default avatarOlga Kornievskaia <kolga@netapp.com>
      Reviewed-by: default avatarTrond Myklebust <trond.myklebust@hammerspace.com>
      Signed-off-by: default avatarAnna Schumaker <Anna.Schumaker@Netapp.com>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      32865d65
    • Mike Jones's avatar
      hwmon: (pmbus/ltc2978) Fix PMBus polling of MFR_COMMON definitions. · aa90c2cb
      Mike Jones authored
      commit cf2b012c upstream.
      
      Change 21537dc driver PMBus polling of MFR_COMMON from bits 5/4 to
      bits 6/5. This fixes a LTC297X family bug where polling always returns
      not busy even when the part is busy. This fixes a LTC388X and
      LTM467X bug where polling used PEND and NOT_IN_TRANS, and BUSY was
      not polled, which can lead to NACKing of commands. LTC388X and
      LTM467X modules now poll BUSY and PEND, increasing reliability by
      eliminating NACKing of commands.
      Signed-off-by: default avatarMike Jones <michael-a1.jones@analog.com>
      Link: https://lore.kernel.org/r/1580234400-2829-2-git-send-email-michael-a1.jones@analog.com
      Fixes: e04d1ce9 ("hwmon: (ltc2978) Add polling for chips requiring it")
      Signed-off-by: default avatarGuenter Roeck <linux@roeck-us.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      aa90c2cb
    • Kan Liang's avatar
      perf/x86/intel: Fix inaccurate period in context switch for auto-reload · 6f1e32c5
      Kan Liang authored
      commit f861854e upstream.
      
      Perf doesn't take the left period into account when auto-reload is
      enabled with fixed period sampling mode in context switch.
      
      Here is the MSR trace of the perf command as below.
      (The MSR trace is simplified from a ftrace log.)
      
          #perf record -e cycles:p -c 2000000 -- ./triad_loop
      
            //The MSR trace of task schedule out
            //perf disable all counters, disable PEBS, disable GP counter 0,
            //read GP counter 0, and re-enable all counters.
            //The counter 0 stops at 0xfffffff82840
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0
            write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 0
            write_msr: MSR_P6_EVNTSEL0(186), value 40003003c
            rdpmc: 0, value fffffff82840
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff
      
            //The MSR trace of the same task schedule in again
            //perf disable all counters, enable and set GP counter 0,
            //enable PEBS, and re-enable all counters.
            //0xffffffe17b80 (-2000000) is written to GP counter 0.
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0
            write_msr: MSR_IA32_PMC0(4c1), value ffffffe17b80
            write_msr: MSR_P6_EVNTSEL0(186), value 40043003c
            write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 1
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff
      
      When the same task schedules in again, the counter should start from the
      previous left period. However, it starts from the fixed period -2000000 again.
      
      A special variant of intel_pmu_save_and_restart() is used for
      auto-reload, which doesn't update the hwc->period_left.
      When the monitored task schedules in again, perf doesn't know the left
      period. The fixed period is used, which is inaccurate.
      
      With auto-reload, the counter always has a negative counter value. So
      the left period is -value. Update the period_left in
      intel_pmu_save_and_restart_reload().
      
      With the patch:
      
            //The MSR trace of task schedule out
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0
            write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 0
            write_msr: MSR_P6_EVNTSEL0(186), value 40003003c
            rdpmc: 0, value ffffffe25cbc
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff
      
            //The MSR trace of the same task schedule in again
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0
            write_msr: MSR_IA32_PMC0(4c1), value ffffffe25cbc
            write_msr: MSR_P6_EVNTSEL0(186), value 40043003c
            write_msr: MSR_IA32_PEBS_ENABLE(3f1), value 1
            write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value f000000ff
      
      Fixes: d31fc13f ("perf/x86/intel: Fix event update for auto-reload")
      Signed-off-by: default avatarKan Liang <kan.liang@linux.intel.com>
      Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
      Signed-off-by: default avatarIngo Molnar <mingo@kernel.org>
      Link: https://lkml.kernel.org/r/20200121190125.3389-1-kan.liang@linux.intel.com
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      6f1e32c5
    • Nathan Chancellor's avatar
      s390/time: Fix clk type in get_tod_clock · fce14b5b
      Nathan Chancellor authored
      commit 0f8a206d upstream.
      
      Clang warns:
      
      In file included from ../arch/s390/boot/startup.c:3:
      In file included from ../include/linux/elf.h:5:
      In file included from ../arch/s390/include/asm/elf.h:132:
      In file included from ../include/linux/compat.h:10:
      In file included from ../include/linux/time.h:74:
      In file included from ../include/linux/time32.h:13:
      In file included from ../include/linux/timex.h:65:
      ../arch/s390/include/asm/timex.h:160:20: warning: passing 'unsigned char
      [16]' to parameter of type 'char *' converts between pointers to integer
      types with different sign [-Wpointer-sign]
              get_tod_clock_ext(clk);
                                ^~~
      ../arch/s390/include/asm/timex.h:149:44: note: passing argument to
      parameter 'clk' here
      static inline void get_tod_clock_ext(char *clk)
                                                 ^
      
      Change clk's type to just be char so that it matches what happens in
      get_tod_clock_ext.
      
      Fixes: 57b28f66 ("[S390] s390_hypfs: Add new attributes")
      Link: https://github.com/ClangBuiltLinux/linux/issues/861
      Link: http://lkml.kernel.org/r/20200208140858.47970-1-natechancellor@gmail.com
      Reviewed-by: default avatarNick Desaulniers <ndesaulniers@google.com>
      Signed-off-by: default avatarNathan Chancellor <natechancellor@gmail.com>
      Signed-off-by: default avatarVasily Gorbik <gor@linux.ibm.com>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      fce14b5b
    • Leon Romanovsky's avatar
      RDMA/core: Fix protection fault in get_pkey_idx_qp_list · 5595f492
      Leon Romanovsky authored
      commit 1dd01788 upstream.
      
      We don't need to set pkey as valid in case that user set only one of pkey
      index or port number, otherwise it will result in a NULL pointer
      dereference while accessing the uninitialized pkey list.  The following
      crash from Syzkaller revealed it.
      
        kasan: CONFIG_KASAN_INLINE enabled
        kasan: GPF could be caused by NULL-ptr deref or user memory access
        general protection fault: 0000 [#1] SMP KASAN PTI
        CPU: 1 PID: 14753 Comm: syz-executor.2 Not tainted 5.5.0-rc5 #2
        Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
        rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
        RIP: 0010:get_pkey_idx_qp_list+0x161/0x2d0
        Code: 01 00 00 49 8b 5e 20 4c 39 e3 0f 84 b9 00 00 00 e8 e4 42 6e fe 48
        8d 7b 10 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04
        02 84 c0 74 08 3c 01 0f 8e d0 00 00 00 48 8d 7d 04 48 b8
        RSP: 0018:ffffc9000bc6f950 EFLAGS: 00010202
        RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff82c8bdec
        RDX: 0000000000000002 RSI: ffffc900030a8000 RDI: 0000000000000010
        RBP: ffff888112c8ce80 R08: 0000000000000004 R09: fffff5200178df1f
        R10: 0000000000000001 R11: fffff5200178df1f R12: ffff888115dc4430
        R13: ffff888115da8498 R14: ffff888115dc4410 R15: ffff888115da8000
        FS:  00007f20777de700(0000) GS:ffff88811b100000(0000)
        knlGS:0000000000000000
        CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        CR2: 0000001b2f721000 CR3: 00000001173ca002 CR4: 0000000000360ee0
        DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
        DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
        Call Trace:
         port_pkey_list_insert+0xd7/0x7c0
         ib_security_modify_qp+0x6fa/0xfc0
         _ib_modify_qp+0x8c4/0xbf0
         modify_qp+0x10da/0x16d0
         ib_uverbs_modify_qp+0x9a/0x100
         ib_uverbs_write+0xaa5/0xdf0
         __vfs_write+0x7c/0x100
         vfs_write+0x168/0x4a0
         ksys_write+0xc8/0x200
         do_syscall_64+0x9c/0x390
         entry_SYSCALL_64_after_hwframe+0x44/0xa9
      
      Fixes: d291f1a6 ("IB/core: Enforce PKey security on QPs")
      Link: https://lore.kernel.org/r/20200212080651.GB679970@unreal
      Signed-off-by: default avatarMaor Gottlieb <maorg@mellanox.com>
      Signed-off-by: default avatarLeon Romanovsky <leonro@mellanox.com>
      Message-Id: <20200212080651.GB679970@unreal>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      5595f492
    • Zhu Yanjun's avatar
      RDMA/rxe: Fix soft lockup problem due to using tasklets in softirq · 5fb35764
      Zhu Yanjun authored
      commit 8ac0e664 upstream.
      
      When running stress tests with RXE, the following call traces often occur:
      
        watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [swapper/2:0]
        ...
        Call Trace:
        <IRQ>
        create_object+0x3f/0x3b0
        kmem_cache_alloc_node_trace+0x129/0x2d0
        __kmalloc_reserve.isra.52+0x2e/0x80
        __alloc_skb+0x83/0x270
        rxe_init_packet+0x99/0x150 [rdma_rxe]
        rxe_requester+0x34e/0x11a0 [rdma_rxe]
        rxe_do_task+0x85/0xf0 [rdma_rxe]
        tasklet_action_common.isra.21+0xeb/0x100
        __do_softirq+0xd0/0x298
        irq_exit+0xc5/0xd0
        smp_apic_timer_interrupt+0x68/0x120
        apic_timer_interrupt+0xf/0x20
        </IRQ>
        ...
      
      The root cause is that tasklet is actually a softirq. In a tasklet
      handler, another softirq handler is triggered. Usually these softirq
      handlers run on the same cpu core. So this will cause "soft lockup Bug".
      
      Fixes: 8700e3e7 ("Soft RoCE driver")
      Link: https://lore.kernel.org/r/20200212072635.682689-8-leon@kernel.org
      Signed-off-by: default avatarZhu Yanjun <yanjunz@mellanox.com>
      Signed-off-by: default avatarLeon Romanovsky <leonro@mellanox.com>
      Signed-off-by: default avatarJason Gunthorpe <jgg@mellanox.com>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      5fb35764
    • Kamal Heib's avatar
      RDMA/hfi1: Fix memory leak in _dev_comp_vect_mappings_create · b817c10b
      Kamal Heib authored
      commit 8a4f300b upstream.
      
      Make sure to free the allocated cpumask_var_t's to avoid the following
      reported memory leak by kmemleak:
      
      $ cat /sys/kernel/debug/kmemleak
      unreferenced object 0xffff8897f812d6a8 (size 8):
        comm "kworker/1:1", pid 347, jiffies 4294751400 (age 101.703s)
        hex dump (first 8 bytes):
          00 00 00 00 00 00 00 00                          ........
        backtrace:
          [<00000000bff49664>] alloc_cpumask_var_node+0x4c/0xb0
          [<0000000075d3ca81>] hfi1_comp_vectors_set_up+0x20f/0x800 [hfi1]
          [<0000000098d420df>] hfi1_init_dd+0x3311/0x4960 [hfi1]
          [<0000000071be7e52>] init_one+0x25e/0xf10 [hfi1]
          [<000000005483d4c2>] local_pci_probe+0xd4/0x180
          [<000000007c3cbc6e>] work_for_cpu_fn+0x51/0xa0
          [<000000001d626905>] process_one_work+0x8f0/0x17b0
          [<000000007e569e7e>] worker_thread+0x536/0xb50
          [<00000000fd39a4a5>] kthread+0x30c/0x3d0
          [<0000000056f2edb3>] ret_from_fork+0x3a/0x50
      
      Fixes: 5d18ee67 ("IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support")
      Link: https://lore.kernel.org/r/20200205110530.12129-1-kamalheib1@gmail.com
      Signed-off-by: default avatarKamal Heib <kamalheib1@gmail.com>
      Reviewed-by: default avatarDennis Dalessandro <dennis.dalessandro@intel.com>
      Signed-off-by: default avatarJason Gunthorpe <jgg@mellanox.com>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      b817c10b
    • Avihai Horon's avatar
      RDMA/core: Fix invalid memory access in spec_filter_size · 11c74276
      Avihai Horon authored
      commit a72f4ac1 upstream.
      
      Add a check that the size specified in the flow spec header doesn't cause
      an overflow when calculating the filter size, and thus prevent access to
      invalid memory.  The following crash from syzkaller revealed it.
      
        kasan: CONFIG_KASAN_INLINE enabled
        kasan: GPF could be caused by NULL-ptr deref or user memory access
        general protection fault: 0000 [#1] SMP KASAN PTI
        CPU: 1 PID: 17834 Comm: syz-executor.3 Not tainted 5.5.0-rc5 #2
        Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
        rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
        RIP: 0010:memchr_inv+0xd3/0x330
        Code: 89 f9 89 f5 83 e1 07 0f 85 f9 00 00 00 49 89 d5 49 c1 ed 03 45 85
        ed 74 6f 48 89 d9 48 b8 00 00 00 00 00 fc ff df 48 c1 e9 03 <80> 3c 01
        00 0f 85 0d 02 00 00 44 0f b6 e5 48 b8 01 01 01 01 01 01
        RSP: 0018:ffffc9000a13fa50 EFLAGS: 00010202
        RAX: dffffc0000000000 RBX: 7fff88810de9d820 RCX: 0ffff11021bd3b04
        RDX: 000000000000fff8 RSI: 0000000000000000 RDI: 7fff88810de9d820
        RBP: 0000000000000000 R08: ffff888110d69018 R09: 0000000000000009
        R10: 0000000000000001 R11: ffffed10236267cc R12: 0000000000000004
        R13: 0000000000001fff R14: ffff88810de9d820 R15: 0000000000000040
        FS:  00007f9ee0e51700(0000) GS:ffff88811b100000(0000)
        knlGS:0000000000000000
        CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        CR2: 0000000000000000 CR3: 0000000115ea0006 CR4: 0000000000360ee0
        DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
        DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
        Call Trace:
         spec_filter_size.part.16+0x34/0x50
         ib_uverbs_kern_spec_to_ib_spec_filter+0x691/0x770
         ib_uverbs_ex_create_flow+0x9ea/0x1b40
         ib_uverbs_write+0xaa5/0xdf0
         __vfs_write+0x7c/0x100
         vfs_write+0x168/0x4a0
         ksys_write+0xc8/0x200
         do_syscall_64+0x9c/0x390
         entry_SYSCALL_64_after_hwframe+0x44/0xa9
        RIP: 0033:0x465b49
        Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89
        f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01
        f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
        RSP: 002b:00007f9ee0e50c58 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
        RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000465b49
        RDX: 00000000000003a0 RSI: 00000000200007c0 RDI: 0000000000000004
        RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
        R10: 0000000000000000 R11: 0000000000000246 R12: 00007f9ee0e516bc
        R13: 00000000004ca2da R14: 000000000070deb8 R15: 00000000ffffffff
        Modules linked in:
        Dumping ftrace buffer:
           (ftrace buffer empty)
      
      Fixes: 94e03f11 ("IB/uverbs: Add support for flow tag")
      Link: https://lore.kernel.org/r/20200126171500.4623-1-leon@kernel.org
      Signed-off-by: Avihai Horon <avihaih@mellanox.com>
      Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
      Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      11c74276
    • Kaike Wan's avatar
      IB/rdmavt: Reset all QPs when the device is shut down · 7697672c
      Kaike Wan authored
      commit f92e4871 upstream.
      
      When the hfi1 device is shut down during a system reboot, it is possible
      that some QPs might not have been freed by ULPs. More requests could be
      post sent and a lingering timer could be triggered to schedule more packet
      sends, leading to a crash:
      
        BUG: unable to handle kernel NULL pointer dereference at 0000000000000102
        IP: [ffffffff810a65f2] __queue_work+0x32/0x3c0
        PGD 0
        Oops: 0000 1 SMP
        Modules linked in: nvmet_rdma(OE) nvmet(OE) nvme(OE) dm_round_robin nvme_rdma(OE) nvme_fabrics(OE) nvme_core(OE) pal_raw(POE) pal_pmt(POE) pal_cache(POE) pal_pile(POE) pal(POE) pal_compatible(OE) rpcrdma sunrpc ib_isert iscsi_target_mod target_core_mod ib_iser libiscsi scsi_transport_iscsi ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib sb_edac edac_core intel_powerclamp coretemp intel_rapl iosf_mbi kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd iTCO_wdt iTCO_vendor_support mxm_wmi ipmi_ssif pcspkr ses enclosure joydev scsi_transport_sas i2c_i801 sg mei_me lpc_ich mei ioatdma shpchp ipmi_si ipmi_devintf ipmi_msghandler wmi acpi_power_meter acpi_pad dm_multipath hangcheck_timer ip_tables ext4 mbcache jbd2 mlx4_en
        sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm mlx4_core crct10dif_pclmul crct10dif_common hfi1(OE) igb crc32c_intel rdmavt(OE) ahci ib_core libahci libata ptp megaraid_sas pps_core dca i2c_algo_bit i2c_core devlink dm_mirror dm_region_hash dm_log dm_mod
        CPU: 23 PID: 0 Comm: swapper/23 Tainted: P OE ------------ 3.10.0-693.el7.x86_64 #1
        Hardware name: Intel Corporation S2600CWR/S2600CWR, BIOS SE5C610.86B.01.01.0028.121720182203 12/17/2018
        task: ffff8808f4ec4f10 ti: ffff8808f4ed8000 task.ti: ffff8808f4ed8000
        RIP: 0010:[ffffffff810a65f2] [ffffffff810a65f2] __queue_work+0x32/0x3c0
        RSP: 0018:ffff88105df43d48 EFLAGS: 00010046
        RAX: 0000000000000086 RBX: 0000000000000086 RCX: 0000000000000000
        RDX: ffff880f74e758b0 RSI: 0000000000000000 RDI: 000000000000001f
        RBP: ffff88105df43d80 R08: ffff8808f3c583c8 R09: ffff8808f3c58000
        R10: 0000000000000002 R11: ffff88105df43da8 R12: ffff880f74e758b0
        R13: 000000000000001f R14: 0000000000000000 R15: ffff88105a300000
        FS: 0000000000000000(0000) GS:ffff88105df40000(0000) knlGS:0000000000000000
        CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        CR2: 0000000000000102 CR3: 00000000019f2000 CR4: 00000000001407e0
        DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
        DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
        Stack:
        ffff88105b6dd708 0000001f00000286 0000000000000086 ffff88105a300000
        ffff880f74e75800 0000000000000000 ffff88105a300000 ffff88105df43d98
        ffffffff810a6b85 ffff88105a301e80 ffff88105df43dc8 ffffffffc0224cde
        Call Trace:
        IRQ
      
        [ffffffff810a6b85] queue_work_on+0x45/0x50
        [ffffffffc0224cde] _hfi1_schedule_send+0x6e/0xc0 [hfi1]
        [ffffffffc0170570] ? get_map_page+0x60/0x60 [rdmavt]
        [ffffffffc0224d62] hfi1_schedule_send+0x32/0x70 [hfi1]
        [ffffffffc0170644] rvt_rc_timeout+0xd4/0x120 [rdmavt]
        [ffffffffc0170570] ? get_map_page+0x60/0x60 [rdmavt]
        [ffffffff81097316] call_timer_fn+0x36/0x110
        [ffffffffc0170570] ? get_map_page+0x60/0x60 [rdmavt]
        [ffffffff8109982d] run_timer_softirq+0x22d/0x310
        [ffffffff81090b3f] __do_softirq+0xef/0x280
        [ffffffff816b6a5c] call_softirq+0x1c/0x30
        [ffffffff8102d3c5] do_softirq+0x65/0xa0
        [ffffffff81090ec5] irq_exit+0x105/0x110
        [ffffffff816b76c2] smp_apic_timer_interrupt+0x42/0x50
        [ffffffff816b5c1d] apic_timer_interrupt+0x6d/0x80
        EOI
      
        [ffffffff81527a02] ? cpuidle_enter_state+0x52/0xc0
        [ffffffff81527b48] cpuidle_idle_call+0xd8/0x210
        [ffffffff81034fee] arch_cpu_idle+0xe/0x30
        [ffffffff810e7bca] cpu_startup_entry+0x14a/0x1c0
        [ffffffff81051af6] start_secondary+0x1b6/0x230
        Code: 89 e5 41 57 41 56 49 89 f6 41 55 41 89 fd 41 54 49 89 d4 53 48 83 ec 10 89 7d d4 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 be 02 00 00 41 f6 86 02 01 00 00 01 0f 85 58 02 00 00 49 c7 c7 28 19 01 00
        RIP [ffffffff810a65f2] __queue_work+0x32/0x3c0
        RSP ffff88105df43d48
        CR2: 0000000000000102
      
      The solution is to reset the QPs before the device resources are freed.
      This reset will change the QP state to prevent post sends and delete
      timers to prevent callbacks.
      
      Fixes: 0acb0cc7 ("IB/rdmavt: Initialize and teardown of qpn table")
      Link: https://lore.kernel.org/r/20200210131040.87408.38161.stgit@awfm-01.aw.intel.com
      Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
      Signed-off-by: Kaike Wan <kaike.wan@intel.com>
      Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      7697672c
    • Mike Marciniszyn's avatar
      IB/hfi1: Close window for pq and request coliding · 63e58567
      Mike Marciniszyn authored
      commit be863834 upstream.
      
      Cleaning up a pq can result in the following warning and panic:
      
        WARNING: CPU: 52 PID: 77418 at lib/list_debug.c:53 __list_del_entry+0x63/0xd0
        list_del corruption, ffff88cb2c6ac068->next is LIST_POISON1 (dead000000000100)
        Modules linked in: mmfs26(OE) mmfslinux(OE) tracedev(OE) 8021q garp mrp ib_isert iscsi_target_mod target_core_mod crc_t10dif crct10dif_generic opa_vnic rpcrdma ib_iser libiscsi scsi_transport_iscsi ib_ipoib(OE) bridge stp llc iTCO_wdt iTCO_vendor_support intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crct10dif_pclmul crct10dif_common crc32_pclmul ghash_clmulni_intel ast aesni_intel ttm lrw gf128mul glue_helper ablk_helper drm_kms_helper cryptd syscopyarea sysfillrect sysimgblt fb_sys_fops drm pcspkr joydev lpc_ich mei_me drm_panel_orientation_quirks i2c_i801 mei wmi ipmi_si ipmi_devintf ipmi_msghandler nfit libnvdimm acpi_power_meter acpi_pad hfi1(OE) rdmavt(OE) rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_core binfmt_misc numatools(OE) xpmem(OE) ip_tables
         nfsv3 nfs_acl nfs lockd grace sunrpc fscache igb ahci i2c_algo_bit libahci dca ptp libata pps_core crc32c_intel [last unloaded: i2c_algo_bit]
        CPU: 52 PID: 77418 Comm: pvbatch Kdump: loaded Tainted: G           OE  ------------   3.10.0-957.38.3.el7.x86_64 #1
        Hardware name: HPE.COM HPE SGI 8600-XA730i Gen10/X11DPT-SB-SG007, BIOS SBED1229 01/22/2019
        Call Trace:
         [<ffffffff90365ac0>] dump_stack+0x19/0x1b
         [<ffffffff8fc98b78>] __warn+0xd8/0x100
         [<ffffffff8fc98bff>] warn_slowpath_fmt+0x5f/0x80
         [<ffffffff8ff970c3>] __list_del_entry+0x63/0xd0
         [<ffffffff8ff9713d>] list_del+0xd/0x30
         [<ffffffff8fddda70>] kmem_cache_destroy+0x50/0x110
         [<ffffffffc0328130>] hfi1_user_sdma_free_queues+0xf0/0x200 [hfi1]
         [<ffffffffc02e2350>] hfi1_file_close+0x70/0x1e0 [hfi1]
         [<ffffffff8fe4519c>] __fput+0xec/0x260
         [<ffffffff8fe453fe>] ____fput+0xe/0x10
         [<ffffffff8fcbfd1b>] task_work_run+0xbb/0xe0
         [<ffffffff8fc2bc65>] do_notify_resume+0xa5/0xc0
         [<ffffffff90379134>] int_signal+0x12/0x17
        BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
        IP: [<ffffffff8fe1f93e>] kmem_cache_close+0x7e/0x300
        PGD 2cdab19067 PUD 2f7bfdb067 PMD 0
        Oops: 0000 [#1] SMP
        Modules linked in: mmfs26(OE) mmfslinux(OE) tracedev(OE) 8021q garp mrp ib_isert iscsi_target_mod target_core_mod crc_t10dif crct10dif_generic opa_vnic rpcrdma ib_iser libiscsi scsi_transport_iscsi ib_ipoib(OE) bridge stp llc iTCO_wdt iTCO_vendor_support intel_powerclamp coretemp intel_rapl iosf_mbi kvm_intel kvm irqbypass crct10dif_pclmul crct10dif_common crc32_pclmul ghash_clmulni_intel ast aesni_intel ttm lrw gf128mul glue_helper ablk_helper drm_kms_helper cryptd syscopyarea sysfillrect sysimgblt fb_sys_fops drm pcspkr joydev lpc_ich mei_me drm_panel_orientation_quirks i2c_i801 mei wmi ipmi_si ipmi_devintf ipmi_msghandler nfit libnvdimm acpi_power_meter acpi_pad hfi1(OE) rdmavt(OE) rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_core binfmt_misc numatools(OE) xpmem(OE) ip_tables
         nfsv3 nfs_acl nfs lockd grace sunrpc fscache igb ahci i2c_algo_bit libahci dca ptp libata pps_core crc32c_intel [last unloaded: i2c_algo_bit]
        CPU: 52 PID: 77418 Comm: pvbatch Kdump: loaded Tainted: G        W  OE  ------------   3.10.0-957.38.3.el7.x86_64 #1
        Hardware name: HPE.COM HPE SGI 8600-XA730i Gen10/X11DPT-SB-SG007, BIOS SBED1229 01/22/2019
        task: ffff88cc26db9040 ti: ffff88b5393a8000 task.ti: ffff88b5393a8000
        RIP: 0010:[<ffffffff8fe1f93e>]  [<ffffffff8fe1f93e>] kmem_cache_close+0x7e/0x300
        RSP: 0018:ffff88b5393abd60  EFLAGS: 00010287
        RAX: 0000000000000000 RBX: ffff88cb2c6ac000 RCX: 0000000000000003
        RDX: 0000000000000400 RSI: 0000000000000400 RDI: ffffffff9095b800
        RBP: ffff88b5393abdb0 R08: ffffffff9095b808 R09: ffffffff8ff77c19
        R10: ffff88b73ce1f160 R11: ffffddecddde9800 R12: ffff88cb2c6ac000
        R13: 000000000000000c R14: ffff88cf3fdca780 R15: 0000000000000000
        FS:  00002aaaaab52500(0000) GS:ffff88b73ce00000(0000) knlGS:0000000000000000
        CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
        CR2: 0000000000000010 CR3: 0000002d27664000 CR4: 00000000007607e0
        DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
        DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
        PKRU: 55555554
        Call Trace:
         [<ffffffff8fe20d44>] __kmem_cache_shutdown+0x14/0x80
         [<ffffffff8fddda78>] kmem_cache_destroy+0x58/0x110
         [<ffffffffc0328130>] hfi1_user_sdma_free_queues+0xf0/0x200 [hfi1]
         [<ffffffffc02e2350>] hfi1_file_close+0x70/0x1e0 [hfi1]
         [<ffffffff8fe4519c>] __fput+0xec/0x260
         [<ffffffff8fe453fe>] ____fput+0xe/0x10
         [<ffffffff8fcbfd1b>] task_work_run+0xbb/0xe0
         [<ffffffff8fc2bc65>] do_notify_resume+0xa5/0xc0
         [<ffffffff90379134>] int_signal+0x12/0x17
        Code: 00 00 ba 00 04 00 00 0f 4f c2 3d 00 04 00 00 89 45 bc 0f 84 e7 01 00 00 48 63 45 bc 49 8d 04 c4 48 89 45 b0 48 8b 80 c8 00 00 00 <48> 8b 78 10 48 89 45 c0 48 83 c0 10 48 89 45 d0 48 8b 17 48 39
        RIP  [<ffffffff8fe1f93e>] kmem_cache_close+0x7e/0x300
         RSP <ffff88b5393abd60>
        CR2: 0000000000000010
      
      The panic is the result of slab entries being freed during the destruction
      of the pq slab.
      
      The code attempts to quiesce the pq, but looking for n_req == 0 doesn't
      account for new requests.
      
      Fix the issue by using SRCU to get a pq pointer and adjust the pq free
      logic to NULL the fd pq pointer prior to the quiesce.
      
      Fixes: e87473bc ("IB/hfi1: Only set fd pointer when base context is completely initialized")
      Link: https://lore.kernel.org/r/20200210131033.87408.81174.stgit@awfm-01.aw.intel.com
      Reviewed-by: Kaike Wan <kaike.wan@intel.com>
      Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
      Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      63e58567
    • Kaike Wan's avatar
      IB/hfi1: Acquire lock to release TID entries when user file is closed · 910b1399
      Kaike Wan authored
      commit a70ed0f2 upstream.
      
      Each user context is allocated a certain number of RcvArray (TID)
      entries and these entries are managed through TID groups. These groups
      are put into one of three lists in each user context: tid_group_list,
      tid_used_list, and tid_full_list, depending on the number of used TID
      entries within each group. When TID packets are expected, one or more
      TID groups will be allocated. After the packets are received, the TID
      groups will be freed. Since multiple user threads may access the TID
      groups simultaneously, a mutex exp_mutex is used to synchronize the
      access. However, when the user file is closed, it tries to release
      all TID groups without acquiring the mutex first, which risks a race
      condition with another thread that may be releasing its TID groups,
      leading to data corruption.
      
      This patch addresses the issue by acquiring the mutex first before
      releasing the TID groups when the file is closed.
      
      Fixes: 3abb33ac ("staging/hfi1: Add TID cache receive init and free funcs")
      Link: https://lore.kernel.org/r/20200210131026.87408.86853.stgit@awfm-01.aw.intel.com
      Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
      Signed-off-by: Kaike Wan <kaike.wan@intel.com>
      Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
      Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      910b1399
    • Yi Zhang's avatar
      nvme: fix the parameter order for nvme_get_log in nvme_get_fw_slot_info · e517ef19
      Yi Zhang authored
      commit f25372ff upstream.
      
      The nvme fw-activate operation produces the warning log below;
      fix it by correcting the parameter order.
      
      [  113.231513] nvme nvme0: Get FW SLOT INFO log error
      
      Fixes: 0e98719b ("nvme: simplify the API for getting log pages")
      Reported-by: Sujith Pandel <sujith_pandel@dell.com>
      Reviewed-by: David Milburn <dmilburn@redhat.com>
      Signed-off-by: Yi Zhang <yi.zhang@redhat.com>
      Signed-off-by: Keith Busch <kbusch@kernel.org>
      Signed-off-by: Jens Axboe <axboe@kernel.dk>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      e517ef19
    • Kim Phillips's avatar
      perf/x86/amd: Add missing L2 misses event spec to AMD Family 17h's event map · a4fc3b99
      Kim Phillips authored
      commit 25d38728 upstream.
      
      Commit 3fe3331b ("perf/x86/amd: Add event map for AMD Family 17h"),
      claimed L2 misses were unsupported, due to them not being found in its
      referenced documentation, whose link has now moved [1].
      
      That old documentation listed PMCx064 unit mask bit 3 as:
      
          "LsRdBlkC: LS Read Block C S L X Change to X Miss."
      
      and bit 0 as:
      
          "IcFillMiss: IC Fill Miss"
      
      We now have new public documentation [2] with improved descriptions, that
      clearly indicate what events those unit mask bits represent:
      
      Bit 3 now clearly states:
      
          "LsRdBlkC: Data Cache Req Miss in L2 (all types)"
      
      and bit 0 is:
      
          "IcFillMiss: Instruction Cache Req Miss in L2."
      
      So we can now add support for L2 misses in perf's genericised events as
      PMCx064 with both the above unit masks.
      
      [1] The commit's original documentation reference, "Processor Programming
          Reference (PPR) for AMD Family 17h Model 01h, Revision B1 Processors",
          originally available here:
      
              https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf
      
          is now available here:
      
              https://developer.amd.com/wordpress/media/2017/11/54945_PPR_Family_17h_Models_00h-0Fh.pdf
      
      [2] "Processor Programming Reference (PPR) for Family 17h Model 31h,
          Revision B0 Processors", available here:
      
      	https://developer.amd.com/wp-content/resources/55803_0.54-PUB.pdf
      
      Fixes: 3fe3331b ("perf/x86/amd: Add event map for AMD Family 17h")
      Reported-by: Babu Moger <babu.moger@amd.com>
      Signed-off-by: Kim Phillips <kim.phillips@amd.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Signed-off-by: Ingo Molnar <mingo@kernel.org>
      Tested-by: Babu Moger <babu.moger@amd.com>
      Cc: stable@vger.kernel.org
      Link: https://lkml.kernel.org/r/20200121171232.28839-1-kim.phillips@amd.com
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      a4fc3b99