1. 09 Mar, 2018 40 commits
    • Mauro Carvalho Chehab's avatar
      media: m88ds3103: don't call a non-initialized function · 1ba2b9e0
      Mauro Carvalho Chehab authored
      commit b9c97c67 upstream.
      
      If the m88ds3103 chip ID is not recognized, the device is not initialized.
      
      However, it returns from probe without any error, causing this OOPS:
      
      [    7.689289] Unable to handle kernel NULL pointer dereference at virtual address 00000000
      [    7.689297] pgd = 7b0bd7a7
      [    7.689302] [00000000] *pgd=00000000
      [    7.689318] Internal error: Oops: 80000005 [#1] SMP ARM
      [    7.689322] Modules linked in: dvb_usb_dvbsky(+) m88ds3103 dvb_usb_v2 dvb_core videobuf2_vmalloc videobuf2_memops videobuf2_core crc32_arm_ce videodev media
      [    7.689358] CPU: 3 PID: 197 Comm: systemd-udevd Not tainted 4.15.0-mcc+ #23
      [    7.689361] Hardware name: BCM2835
      [    7.689367] PC is at 0x0
      [    7.689382] LR is at m88ds3103_attach+0x194/0x1d0 [m88ds3103]
      [    7.689386] pc : [<00000000>]    lr : [<bf0ae1ec>]    psr: 60000013
      [    7.689391] sp : ed8e5c20  ip : ed8c1e00  fp : ed8945c0
      [    7.689395] r10: ed894000  r9 : ed894378  r8 : eda736c0
      [    7.689400] r7 : ed894070  r6 : ed8e5c44  r5 : bf0bb040  r4 : eda77600
      [    7.689405] r3 : 00000000  r2 : 00000000  r1 : 00000000  r0 : eda77600
      [    7.689412] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
      [    7.689417] Control: 10c5383d  Table: 2d8e806a  DAC: 00000051
      [    7.689423] Process systemd-udevd (pid: 197, stack limit = 0xe9dbfb63)
      [    7.689428] Stack: (0xed8e5c20 to 0xed8e6000)
      [    7.689439] 5c20: ed853a80 eda73640 ed894000 ed8942c0 ed853a80 bf0b9e98 ed894070 bf0b9f10
      [    7.689449] 5c40: 00000000 00000000 bf08c17c c08dfc50 00000000 00000000 00000000 00000000
      [    7.689459] 5c60: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
      [    7.689468] 5c80: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
      [    7.689479] 5ca0: 00000000 00000000 ed8945c0 ed8942c0 ed894000 ed894830 bf0b9e98 00000000
      [    7.689490] 5cc0: ed894378 bf0a3cb4 bf0bc3b0 0000533b ed920540 00000000 00000034 bf0a6434
      [    7.689500] 5ce0: ee952070 ed826600 bf0a7038 bf0a2dd8 00000001 bf0a6768 bf0a2f90 ed8943c0
      [    7.689511] 5d00: 00000000 c08eca68 ed826620 ed826620 00000000 ee952070 bf0bc034 ee952000
      [    7.689521] 5d20: ed826600 bf0bb080 ffffffed c0aa9e9c c0aa9dac ed826620 c16edf6c c168c2c8
      [    7.689531] 5d40: c16edf70 00000000 bf0bc034 0000000d 00000000 c08e268c bf0bb080 ed826600
      [    7.689541] 5d60: bf0bc034 ed826654 ed826620 bf0bc034 c164c8bc 00000000 00000001 00000000
      [    7.689553] 5d80: 00000028 c08e2948 00000000 bf0bc034 c08e2848 c08e0778 ee9f0a58 ed88bab4
      [    7.689563] 5da0: bf0bc034 ed90ba80 c168c1f0 c08e1934 bf0bb3bc c17045ac bf0bc034 c164c8bc
      [    7.689574] 5dc0: bf0bc034 bf0bb3bc ed91f564 c08e34ec bf0bc000 c164c8bc bf0bc034 c0aa8dc4
      [    7.689584] 5de0: ffffe000 00000000 bf0bf000 ed91f600 ed91f564 c03021e4 00000001 00000000
      [    7.689595] 5e00: c166e040 8040003f ed853a80 bf0bc448 00000000 c1678174 ed853a80 f0f22000
      [    7.689605] 5e20: f0f21fff 8040003f 014000c0 ed91e700 ed91e700 c16d8e68 00000001 ed91e6c0
      [    7.689615] 5e40: bf0bc400 00000001 bf0bc400 ed91f564 00000001 00000000 00000028 c03c9a24
      [    7.689625] 5e60: 00000001 c03c8c94 ed8e5f50 ed8e5f50 00000001 bf0bc400 ed91f540 c03c8cb0
      [    7.689637] 5e80: bf0bc40c 00007fff bf0bc400 c03c60b0 00000000 bf0bc448 00000028 c0e09684
      [    7.689647] 5ea0: 00000002 bf0bc530 c1234bf8 bf0bc5dc bf0bc514 c10ebbe8 ffffe000 bf000000
      [    7.689657] 5ec0: 00011538 00000000 ed8e5f48 00000000 00000000 00000000 00000000 00000000
      [    7.689666] 5ee0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
      [    7.689676] 5f00: 00000000 00000000 7fffffff 00000000 00000013 b6e55a18 0000017b c0309104
      [    7.689686] 5f20: ed8e4000 00000000 00510af0 c03c9430 7fffffff 00000000 00000003 00000000
      [    7.689697] 5f40: 00000000 f0f0f000 00011538 00000000 f0f107b0 f0f0f000 00011538 f0f1fdb8
      [    7.689707] 5f60: f0f1fbe8 f0f1b974 00004000 000041e0 bf0bc3d0 00000001 00000000 000024c4
      [    7.689717] 5f80: 0000002d 0000002e 00000019 00000000 00000010 00000000 16894000 00000000
      [    7.689727] 5fa0: 00000000 c0308f20 16894000 00000000 00000013 b6e55a18 00000000 b6e5652c
      [    7.689737] 5fc0: 16894000 00000000 00000000 0000017b 00020000 00508110 00000000 00510af0
      [    7.689748] 5fe0: bef68948 bef68938 b6e4d3d0 b6d32590 60000010 00000013 00000000 00000000
      [    7.689790] [<bf0ae1ec>] (m88ds3103_attach [m88ds3103]) from [<bf0b9f10>] (dvbsky_s960c_attach+0x78/0x280 [dvb_usb_dvbsky])
      [    7.689821] [<bf0b9f10>] (dvbsky_s960c_attach [dvb_usb_dvbsky]) from [<bf0a3cb4>] (dvb_usbv2_probe+0xa3c/0x1024 [dvb_usb_v2])
      [    7.689849] [<bf0a3cb4>] (dvb_usbv2_probe [dvb_usb_v2]) from [<c0aa9e9c>] (usb_probe_interface+0xf0/0x2a8)
      [    7.689869] [<c0aa9e9c>] (usb_probe_interface) from [<c08e268c>] (driver_probe_device+0x2f8/0x4b4)
      [    7.689881] [<c08e268c>] (driver_probe_device) from [<c08e2948>] (__driver_attach+0x100/0x11c)
      [    7.689895] [<c08e2948>] (__driver_attach) from [<c08e0778>] (bus_for_each_dev+0x4c/0x9c)
      [    7.689909] [<c08e0778>] (bus_for_each_dev) from [<c08e1934>] (bus_add_driver+0x1c0/0x264)
      [    7.689919] [<c08e1934>] (bus_add_driver) from [<c08e34ec>] (driver_register+0x78/0xf4)
      [    7.689931] [<c08e34ec>] (driver_register) from [<c0aa8dc4>] (usb_register_driver+0x70/0x134)
      [    7.689946] [<c0aa8dc4>] (usb_register_driver) from [<c03021e4>] (do_one_initcall+0x44/0x168)
      [    7.689963] [<c03021e4>] (do_one_initcall) from [<c03c9a24>] (do_init_module+0x64/0x1f4)
      [    7.689979] [<c03c9a24>] (do_init_module) from [<c03c8cb0>] (load_module+0x20a0/0x25c8)
      [    7.689993] [<c03c8cb0>] (load_module) from [<c03c9430>] (SyS_finit_module+0xb4/0xec)
      [    7.690007] [<c03c9430>] (SyS_finit_module) from [<c0308f20>] (ret_fast_syscall+0x0/0x54)
      [    7.690018] Code: bad PC value
      
      This may happen under normal circumstances if, for some reason, the demod
      hangs and starts returning an invalid chip ID:
      
      [   10.394395] m88ds3103 3-0068: Unknown device. Chip_id=00
      
      So, change the logic to cause probe to fail with -ENODEV, preventing
      the OOPS.
      
      Detected while testing DVB MMAP patches on Raspberry Pi 3 with
      DVBSky S960CI.
      
      Cc: stable@vger.kernel.org
      Signed-off-by: Mauro Carvalho Chehab <mchehab@s-opensource.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      1ba2b9e0
    • Ming Lei's avatar
      blk-mq: don't call io sched's .requeue_request when requeueing rq to ->dispatch · ccddee81
      Ming Lei authored
      commit 105976f5 upstream.
      
      __blk_mq_requeue_request() covers two cases:
      
      - one is that the requeued request is added to hctx->dispatch, such as
      blk_mq_dispatch_rq_list()
      
      - another case is that the request is requeued to io scheduler, such as
      blk_mq_requeue_request().
      
      We should call io sched's .requeue_request callback only for the 2nd
      case.
      
      Cc: Paolo Valente <paolo.valente@linaro.org>
      Cc: Omar Sandoval <osandov@fb.com>
      Fixes: bd166ef1 ("blk-mq-sched: add framework for MQ capable IO schedulers")
      Cc: stable@vger.kernel.org
      Reviewed-by: Bart Van Assche <bart.vanassche@wdc.com>
      Acked-by: Paolo Valente <paolo.valente@linaro.org>
      Signed-off-by: Ming Lei <ming.lei@redhat.com>
      Signed-off-by: Jens Axboe <axboe@kernel.dk>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      ccddee81
    • Julian Wiedmann's avatar
      s390/qeth: fix IPA command submission race · c5f32462
      Julian Wiedmann authored
      
      [ Upstream commit d22ffb5a ]
      
      If multiple IPA commands are built & sent out concurrently,
      fill_ipacmd_header() may assign a seqno value to a command that's
      different from what send_control_data() later assigns to this command's
      reply.
      This is due to other commands passing through send_control_data(),
      and incrementing card->seqno.ipa along the way.
      
      So one IPA command has no reply that's waiting for its seqno, while some
      other IPA command has multiple reply objects waiting for it.
      Only one of those waiting replies wins, and the other(s) time out and
      trigger a recovery via send_ipa_cmd().
      
      Fix this by making sure that the same seqno value is assigned to
      a command and its reply object.
      Do so immediately before submitting the command & while holding the
      irq_pending "lock", to produce nicely ascending seqnos.
      
      As a side effect, *all* IPA commands now use a reply object that's
      waiting for its actual seqno. Previously, early IPA commands that were
      submitted while the card was still DOWN used the "catch-all" IDX seqno.
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      c5f32462
    • Julian Wiedmann's avatar
      s390/qeth: fix IP address lookup for L3 devices · eae17c40
      Julian Wiedmann authored
      
      [ Upstream commit c5c48c58 ]
      
      Current code ("qeth_l3_ip_from_hash()") matches a queried address object
      against objects in the IP table by IP address, Mask/Prefix Length and
      MAC address ("qeth_l3_ipaddrs_is_equal()"). But what callers actually
      require is either
      a) "is this IP address registered" (ie. match by IP address only),
      before adding a new address.
      b) or "is this address object registered" (ie. match all relevant
         attributes), before deleting an address.
      
      Right now
      1. the ADD path is too strict in its lookup, and eg. doesn't detect
      conflicts between an existing NORMAL address and a new VIPA address
      (because the NORMAL address will have mask != 0, while VIPA has
      a mask == 0),
      2. the DELETE path is not strict enough, and eg. allows del_rxip() to
      delete a VIPA address as long as the IP address matches.
      
      Fix all this by adding helpers (_addr_match_ip() and _addr_match_all())
      that do the appropriate checking.
      
      Note that the ADD path for NORMAL addresses is special, as qeth keeps
      track of how many times such an address is in use (and there is no
      immediate way of returning errors to the caller). So when a requested
      NORMAL address _fully_ matches an existing one, it's not considered a
      conflict and we merely increment the refcount.
      
      Fixes: 5f78e29c ("qeth: optimize IP handling in rx_mode callback")
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      eae17c40
    • Julian Wiedmann's avatar
      Revert "s390/qeth: fix using of ref counter for rxip addresses" · 87c4789f
      Julian Wiedmann authored
      
      [ Upstream commit 4964c66f ]
      
      This reverts commit cb816192.
      
      The issue this attempted to fix never actually occurs.
      l3_add_rxip() checks (via l3_ip_from_hash()) if the requested address
      was previously added to the card. If so, it returns -EEXIST and doesn't
      call l3_add_ip().
      As a result, the "address exists" path in l3_add_ip() is never taken
      for rxip addresses, and this patch had no effect.
      
      Fixes: cb816192 ("s390/qeth: fix using of ref counter for rxip addresses")
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      87c4789f
    • Julian Wiedmann's avatar
      s390/qeth: fix double-free on IP add/remove race · 56f662db
      Julian Wiedmann authored
      
      [ Upstream commit 14d066c3 ]
      
      Registering an IPv4 address with the HW takes quite a while, so we
      temporarily drop the ip_htable lock. Any concurrent add/remove of the
      same IP adjusts the IP's use count, and (on remove) is then blocked by
      addr->in_progress.
      After the register call has completed, we check the use count for
      concurrently attempted add/remove calls - and possibly straight-away
      deregister the IP again. This happens via l3_delete_ip(), which
      1) looks up the queried IP in the htable (getting a reference to the
         *same* queried object),
      2) deregisters the IP from the HW, and
      3) frees the IP object.
      
      The caller in l3_add_ip() then does a second free on the same object.
      
      For this case, skip all the extra checks and lookups in l3_delete_ip()
      and just deregister & free the IP object ourselves.
      
      Fixes: 5f78e29c ("qeth: optimize IP handling in rx_mode callback")
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      56f662db
    • Julian Wiedmann's avatar
      s390/qeth: fix IP removal on offline cards · 02763710
      Julian Wiedmann authored
      
      [ Upstream commit 98d823ab ]
      
      If the HW is not reachable, then none of the IPs in qeth's internal
      table has been registered with the HW yet. So when deleting such an IP,
      there's no need to stage it for deregistration - just drop it from
      the table.
      
      This fixes the "add-delete-add" scenario on an offline card, where the
      second "add" merely increments the IP's use count. But as the IP is
      still set to DISP_ADDR_DELETE from the previous "delete" step,
      l3_recover_ip() won't register it with the HW when the card goes online.
      
      Fixes: 5f78e29c ("qeth: optimize IP handling in rx_mode callback")
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      02763710
    • Julian Wiedmann's avatar
      s390/qeth: fix overestimated count of buffer elements · fa4919e3
      Julian Wiedmann authored
      
      [ Upstream commit 12472af8 ]
      
      qeth_get_elements_for_range() doesn't know how to handle a 0-length
      range (ie. start == end), and returns 1 when it should return 0.
      Such ranges occur on TSO skbs, where the L2/L3/L4 headers (and thus all
      of the skb's linear data) are skipped when mapping the skb into regular
      buffer elements.
      
      This overestimation may cause several performance-related issues:
      1. sub-optimal IO buffer selection, where the next buffer gets selected
         even though the skb would actually still fit into the current buffer.
      2. forced linearization, if the element count for a non-linear skb
         exceeds QETH_MAX_BUFFER_ELEMENTS.
      
      Rather than modifying qeth_get_elements_for_range() and adding overhead
      to every caller, fix up those callers that are at risk of passing a
      0-length range.
      
      Fixes: 2863c613 ("qeth: refactor calculation of SBALE count")
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      fa4919e3
    • Julian Wiedmann's avatar
      s390/qeth: fix SETIP command handling · 128c7e69
      Julian Wiedmann authored
      
      [ Upstream commit 1c5b2216 ]
      
      send_control_data() applies some special handling to SETIP v4 IPA
      commands. But current code parses *all* command types for the SETIP
      command code. Limit the command code check to IPA commands.
      
      Fixes: 5b54e16f ("qeth: do not spin for SETIP ip assist command")
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      128c7e69
    • Ursula Braun's avatar
      s390/qeth: fix underestimated count of buffer elements · fcdfb9d8
      Ursula Braun authored
      
      [ Upstream commit 89271c65 ]
      
      For a memory range/skb where the last byte falls onto a page boundary
      (ie. 'end' is of the form xxx...xxx001), the PFN_UP() part of the
      calculation currently doesn't round up to the next PFN due to an
      off-by-one error.
      Thus qeth believes that the skb occupies one page less than it
      actually does, and may select an IO buffer that doesn't have enough spare
      buffer elements to fit all of the skb's data.
      HW detects this as a malformed buffer descriptor, and raises an
      exception which then triggers device recovery.
      
      Fixes: 2863c613 ("qeth: refactor calculation of SBALE count")
      Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
      Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      fcdfb9d8
    • Jason Wang's avatar
      virtio-net: disable NAPI only when enabled during XDP set · 99a78194
      Jason Wang authored
      
      [ Upstream commit 4e09ff53 ]
      
      We try to disable NAPI to prevent a single XDP TX queue from being used by
      multiple CPUs. But we don't check whether the device is up (NAPI is enabled),
      which could result in a stall because of an infinite wait in
      napi_disable(). Fix this by checking the device state through
      netif_running() beforehand.
      
      Fixes: 4941d472 ("virtio-net: do not reset during XDP set")
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Acked-by: Michael S. Tsirkin <mst@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      99a78194
    • Jason Wang's avatar
      tuntap: disable preemption during XDP processing · 5134b919
      Jason Wang authored
      
      [ Upstream commit 23e43f07 ]
      
      Except for tuntap, all other drivers implement XDP in the NAPI
      poll() routine in a bh. This guarantees that all XDP operations are done on
      the same CPU, which is required by e.g. BPF_MAP_TYPE_PERCPU_ARRAY. But
      for tuntap, we do it in process context and try to protect XDP
      processing with the RCU reader lock. This is insufficient since, with
      CONFIG_PREEMPT_RCU, the RCU reader critical section can be preempted, which
      breaks the assumption that all XDP processing happens on the same CPU.
      
      Fix this by simply disabling preemption during XDP processing.
      
      Fixes: 761876c8 ("tap: XDP support")
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      5134b919
    • Jason Wang's avatar
      tuntap: correctly add the missing XDP flush · 1903344b
      Jason Wang authored
      
      [ Upstream commit 1bb4f2e8 ]
      
      We don't flush batched XDP packets through xdp_do_flush_map(), which
      causes packets to stall in the TX queue. Considering that we don't do XDP
      in NAPI poll(), the only possible fix is to call xdp_do_flush_map()
      immediately after xdp_do_redirect().
      
      Note that this in fact won't try to batch packets through devmap; we could
      address that in the future.
      Reported-by: Christoffer Dall <christoffer.dall@linaro.org>
      Fixes: 761876c8 ("tap: XDP support")
      Signed-off-by: Jason Wang <jasowang@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      1903344b
    • Soheil Hassas Yeganeh's avatar
      tcp: purge write queue upon RST · abb4a8b8
      Soheil Hassas Yeganeh authored
      
      [ Upstream commit a27fd7a8 ]
      
      When the connection is reset, there is no point in
      keeping the packets on the write queue until the connection
      is closed.
      
      RFC 793 (page 70) and RFC 793-bis (page 64) both suggest
      purging the write queue upon RST:
      https://tools.ietf.org/html/draft-ietf-tcpm-rfc793bis-07
      
      Moreover, this is essential for a correct MSG_ZEROCOPY
      implementation, because userspace cannot call close(fd)
      before receiving zerocopy signals even when the connection
      is reset.
      
      Fixes: f214f915 ("tcp: enable MSG_ZEROCOPY")
      Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
      Reviewed-by: Eric Dumazet <edumazet@google.com>
      Signed-off-by: Yuchung Cheng <ycheng@google.com>
      Signed-off-by: Neal Cardwell <ncardwell@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      abb4a8b8
    • Jason A. Donenfeld's avatar
      netlink: put module reference if dump start fails · eec434c5
      Jason A. Donenfeld authored
      
      [ Upstream commit b87b6194 ]
      
      Before, if cb->start() failed, the module reference would never be put,
      because cb->cb_running is intentionally false at this point. Users are
      generally annoyed by this because they can no longer unload modules that
      leak references. Also, it may be possible to tediously wrap a reference
      counter back to zero, especially since module.c still uses atomic_inc
      instead of refcount_inc.
      
      This patch expands the error path to simply call module_put if
      cb->start() fails.
      
      Fixes: 41c87425 ("netlink: do not set cb_running if dump's start() errs")
      Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      eec434c5
    • Ido Schimmel's avatar
      mlxsw: spectrum_router: Do not unconditionally clear route offload indication · abd7663b
      Ido Schimmel authored
      
      [ Upstream commit d1c95af3 ]
      
      When mlxsw replaces (or deletes) a route it removes the offload
      indication from the replaced route. This is problematic for IPv4 routes,
      as the offload indication is stored in the fib_info which is usually
      shared between multiple routes.
      
      Instead of unconditionally clearing the offload indication, only clear
      it if no other route is using the fib_info.
      
      Fixes: 3984d1a8 ("mlxsw: spectrum_router: Provide offload indication using nexthop flags")
      Signed-off-by: Ido Schimmel <idosch@mellanox.com>
      Reported-by: Alexander Petrovskiy <alexpe@mellanox.com>
      Tested-by: Alexander Petrovskiy <alexpe@mellanox.com>
      Signed-off-by: Jiri Pirko <jiri@mellanox.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      abd7663b
    • Paolo Abeni's avatar
      cls_u32: fix use after free in u32_destroy_key() · ebadf888
      Paolo Abeni authored
      
      [ Upstream commit d7cdee5e ]
      
      Li Shuang reported an Oops with cls_u32 due to a use-after-free
      in u32_destroy_key(). The use-after-free can be triggered with:
      
      dev=lo
      tc qdisc add dev $dev root handle 1: htb default 10
      tc filter add dev $dev parent 1: prio 5 handle 1: protocol ip u32 divisor 256
      tc filter add dev $dev protocol ip parent 1: prio 5 u32 ht 800:: match ip dst\
       10.0.0.0/8 hashkey mask 0x0000ff00 at 16 link 1:
      tc qdisc del dev $dev root
      
      Which causes the following kasan splat:
      
       ==================================================================
       BUG: KASAN: use-after-free in u32_destroy_key.constprop.21+0x117/0x140 [cls_u32]
       Read of size 4 at addr ffff881b83dae618 by task kworker/u48:5/571
      
       CPU: 17 PID: 571 Comm: kworker/u48:5 Not tainted 4.15.0+ #87
       Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.1.7 06/16/2016
       Workqueue: tc_filter_workqueue u32_delete_key_freepf_work [cls_u32]
       Call Trace:
        dump_stack+0xd6/0x182
        ? dma_virt_map_sg+0x22e/0x22e
        print_address_description+0x73/0x290
        kasan_report+0x277/0x360
        ? u32_destroy_key.constprop.21+0x117/0x140 [cls_u32]
        u32_destroy_key.constprop.21+0x117/0x140 [cls_u32]
        u32_delete_key_freepf_work+0x1c/0x30 [cls_u32]
        process_one_work+0xae0/0x1c80
        ? sched_clock+0x5/0x10
        ? pwq_dec_nr_in_flight+0x3c0/0x3c0
        ? _raw_spin_unlock_irq+0x29/0x40
        ? trace_hardirqs_on_caller+0x381/0x570
        ? _raw_spin_unlock_irq+0x29/0x40
        ? finish_task_switch+0x1e5/0x760
        ? finish_task_switch+0x208/0x760
        ? preempt_notifier_dec+0x20/0x20
        ? __schedule+0x839/0x1ee0
        ? check_noncircular+0x20/0x20
        ? firmware_map_remove+0x73/0x73
        ? find_held_lock+0x39/0x1c0
        ? worker_thread+0x434/0x1820
        ? lock_contended+0xee0/0xee0
        ? lock_release+0x1100/0x1100
        ? init_rescuer.part.16+0x150/0x150
        ? retint_kernel+0x10/0x10
        worker_thread+0x216/0x1820
        ? process_one_work+0x1c80/0x1c80
        ? lock_acquire+0x1a5/0x540
        ? lock_downgrade+0x6b0/0x6b0
        ? sched_clock+0x5/0x10
        ? lock_release+0x1100/0x1100
        ? compat_start_thread+0x80/0x80
        ? do_raw_spin_trylock+0x190/0x190
        ? _raw_spin_unlock_irq+0x29/0x40
        ? trace_hardirqs_on_caller+0x381/0x570
        ? _raw_spin_unlock_irq+0x29/0x40
        ? finish_task_switch+0x1e5/0x760
        ? finish_task_switch+0x208/0x760
        ? preempt_notifier_dec+0x20/0x20
        ? __schedule+0x839/0x1ee0
        ? kmem_cache_alloc_trace+0x143/0x320
        ? firmware_map_remove+0x73/0x73
        ? sched_clock+0x5/0x10
        ? sched_clock_cpu+0x18/0x170
        ? find_held_lock+0x39/0x1c0
        ? schedule+0xf3/0x3b0
        ? lock_downgrade+0x6b0/0x6b0
        ? __schedule+0x1ee0/0x1ee0
        ? do_wait_intr_irq+0x340/0x340
        ? do_raw_spin_trylock+0x190/0x190
        ? _raw_spin_unlock_irqrestore+0x32/0x60
        ? process_one_work+0x1c80/0x1c80
        ? process_one_work+0x1c80/0x1c80
        kthread+0x312/0x3d0
        ? kthread_create_worker_on_cpu+0xc0/0xc0
        ret_from_fork+0x3a/0x50
      
       Allocated by task 1688:
        kasan_kmalloc+0xa0/0xd0
        __kmalloc+0x162/0x380
        u32_change+0x1220/0x3c9e [cls_u32]
        tc_ctl_tfilter+0x1ba6/0x2f80
        rtnetlink_rcv_msg+0x4f0/0x9d0
        netlink_rcv_skb+0x124/0x320
        netlink_unicast+0x430/0x600
        netlink_sendmsg+0x8fa/0xd60
        sock_sendmsg+0xb1/0xe0
        ___sys_sendmsg+0x678/0x980
        __sys_sendmsg+0xc4/0x210
        do_syscall_64+0x232/0x7f0
        return_from_SYSCALL_64+0x0/0x75
      
       Freed by task 112:
        kasan_slab_free+0x71/0xc0
        kfree+0x114/0x320
        rcu_process_callbacks+0xc3f/0x1600
        __do_softirq+0x2bf/0xc06
      
       The buggy address belongs to the object at ffff881b83dae600
        which belongs to the cache kmalloc-4096 of size 4096
       The buggy address is located 24 bytes inside of
        4096-byte region [ffff881b83dae600, ffff881b83daf600)
       The buggy address belongs to the page:
       page:ffffea006e0f6a00 count:1 mapcount:0 mapping:          (null) index:0x0 compound_mapcount: 0
       flags: 0x17ffffc0008100(slab|head)
       raw: 0017ffffc0008100 0000000000000000 0000000000000000 0000000100070007
       raw: dead000000000100 dead000000000200 ffff880187c0e600 0000000000000000
       page dumped because: kasan: bad access detected
      
       Memory state around the buggy address:
        ffff881b83dae500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
        ffff881b83dae580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
       >ffff881b83dae600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                                   ^
        ffff881b83dae680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
        ffff881b83dae700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
       ==================================================================
      
      The problem is that the htnode is freed before the linked knodes and the
      latter will try to access the first at u32_destroy_key() time.
      This change addresses the issue using the htnode refcnt to guarantee
      the correct free order. While at it also add a RCU annotation,
      to keep sparse happy.
      
      v1 -> v2: use rtnl_dereference() instead of RCU read locks
      v2 -> v3:
        - don't check refcnt in u32_destroy_hnode()
        - cleaned-up u32_destroy() implementation
        - cleaned-up code comment
      v3 -> v4:
        - dropped unneeded comment
      Reported-by: Li Shuang <shuali@redhat.com>
      Fixes: c0d378ef ("net_sched: use tcf_queue_work() in u32 filter")
      Signed-off-by: Paolo Abeni <pabeni@redhat.com>
      Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      ebadf888
    • Tom Lendacky's avatar
      amd-xgbe: Restore PCI interrupt enablement setting on resume · fb8a84cb
      Tom Lendacky authored
      
      [ Upstream commit cfd092f2 ]
      
      After resuming from suspend, the PCI device support must re-enable the
      interrupt setting so that interrupts are actually delivered.
      Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      fb8a84cb
    • Eran Ben Elisha's avatar
      net/mlx5e: Verify inline header size does not exceed SKB linear size · e7b316ac
      Eran Ben Elisha authored
      
      [ Upstream commit f600c608 ]
      
      Driver tries to copy at least MLX5E_MIN_INLINE bytes into the control
      segment of the WQE. It assumes that the linear part contains at least
      MLX5E_MIN_INLINE bytes, which can be wrong.
      
      The cited commit verified that the driver will not copy more bytes into the
      inline header part than the actual size of the packet. Refactor this
      check to make sure we do not exceed the linear part as well.
      
      This fix is aligned with the current driver's assumption that the entire
      L2 will be present in the linear part of the SKB.
      
      Fixes: 6aace17e ("net/mlx5e: Fix inline header size for small packets")
      Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
      Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      e7b316ac
    • Ido Schimmel's avatar
      bridge: Fix VLAN reference count problem · cbd173b8
      Ido Schimmel authored
      
      [ Upstream commit 0e5a82ef ]
      
      When a VLAN is added on a port, a reference is taken on the
      corresponding master VLAN entry. If it does not already exist, then it
      is created and a reference taken.
      
      However, in the second case a reference is not really taken when
      CONFIG_REFCOUNT_FULL is enabled as refcount_inc() is replaced by
      refcount_inc_not_zero().
      
      Fix this by using refcount_set() on a newly created master VLAN entry.
      
      Fixes: 25127759 ("net, bridge: convert net_bridge_vlan.refcnt from atomic_t to refcount_t")
      Signed-off-by: Ido Schimmel <idosch@mellanox.com>
      Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      cbd173b8
    • Alexey Kodanev's avatar
      sctp: fix dst refcnt leak in sctp_v6_get_dst() · 00ec3b0c
      Alexey Kodanev authored
      
      [ Upstream commit 957d761c ]
      
      When going through the bind address list in sctp_v6_get_dst() and
      the previously found address is better ('matchlen > bmatchlen'),
      the code continues to the next iteration without releasing currently
      held destination.
      
      Fix it by releasing 'bdst' before continuing to the next iteration, and
      instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
      move the already existing one right after ip6_dst_lookup_flow(), i.e. we
      shouldn't proceed further if we get an error for the route lookup.
      
      Fixes: dbc2b5e9 ("sctp: fix src address selection if using secondary addresses for ipv6")
      Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
      Acked-by: Neil Horman <nhorman@tuxdriver.com>
      Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      00ec3b0c
    • David Ahern's avatar
      net: ipv4: Set addr_type in hash_keys for forwarded case · 97ba6e5f
      David Ahern authored
      
      [ Upstream commit 1fe4b118 ]
      
      The result of the skb flow dissect is copied from keys to hash_keys to
      ensure only the intended data is hashed. The original L4 hash patch
      overlooked setting the addr_type for this case; add it.
      
      Fixes: bf4e0a3d ("net: ipv4: add support for ECMP hash policy choice")
      Reported-by: Ido Schimmel <idosch@idosch.org>
      Signed-off-by: David Ahern <dsahern@gmail.com>
      Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
      Reviewed-by: Ido Schimmel <idosch@mellanox.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      97ba6e5f
    • Jiri Pirko's avatar
      mlxsw: spectrum_router: Fix error path in mlxsw_sp_vr_create · 73cb791f
      Jiri Pirko authored
      
      [ Upstream commit 0f2d2b27 ]
      
      Since mlxsw_sp_fib_create() and mlxsw_sp_mr_table_create()
      use ERR_PTR macro to propagate int err through return of a pointer,
      the return value is not NULL in case of failure. So if one
      of the calls fails, one of vr->fib4, vr->fib6 or vr->mr4_table
      is not NULL and mlxsw_sp_vr_is_used wrongly assumes
      that vr is in use, which leads to a crash like the following one:
      
      [ 1293.949291] BUG: unable to handle kernel NULL pointer dereference at 00000000000006c9
      [ 1293.952729] IP: mlxsw_sp_mr_table_flush+0x15/0x70 [mlxsw_spectrum]
      
      Fix this by using local variables to hold the pointers and set vr->*
      only in case everything went fine.
      
      Fixes: 76610ebb ("mlxsw: spectrum_router: Refactor virtual router handling")
      Fixes: a3d9bc50 ("mlxsw: spectrum_router: Extend virtual routers with IPv6 support")
      Fixes: d42b0965 ("mlxsw: spectrum_router: Add multicast routes notification handling functionality")
      Signed-off-by: Jiri Pirko <jiri@mellanox.com>
      Reviewed-by: Ido Schimmel <idosch@mellanox.com>
      Signed-off-by: Jiri Pirko <jiri@mellanox.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      73cb791f
    • Yuchung Cheng's avatar
      tcp: revert F-RTO extension to detect more spurious timeouts · 0ab87ec9
      Yuchung Cheng authored
      
      [ Upstream commit fc68e171 ]
      
      This reverts commit 89fe18e4.
      
      While the patch could detect more spurious timeouts, it could cause
      poor TCP performance on broken middle-boxes that modify TCP packets
      (e.g. receive window, SACK options). Since the performance gain is
      much smaller than the potential loss, the best solution is
      to fully revert the change.
      
      Fixes: 89fe18e4 ("tcp: extend F-RTO to catch more spurious timeouts")
      Reported-by: Teodor Milkov <tm@del.bg>
      Signed-off-by: Yuchung Cheng <ycheng@google.com>
      Signed-off-by: Neal Cardwell <ncardwell@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      0ab87ec9
    • Yuchung Cheng's avatar
      tcp: revert F-RTO middle-box workaround · cc8dadb8
      Yuchung Cheng authored
      
      [ Upstream commit d4131f09 ]
      
      This reverts commit cc663f4d. While fixing
      some broken middle-boxes that modify receive window fields, it does not
      address middle-boxes that strip off SACK options. The best solution is
      to fully revert this patch and the root F-RTO enhancement.
      
      Fixes: cc663f4d ("tcp: restrict F-RTO to work-around broken middle-boxes")
      Reported-by: Teodor Milkov <tm@del.bg>
      Signed-off-by: Yuchung Cheng <ycheng@google.com>
      Signed-off-by: Neal Cardwell <ncardwell@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      cc8dadb8
    • Xin Long's avatar
      sctp: do not pr_err for the duplicated node in transport rhlist · 36728a6b
      Xin Long authored
      
      [ Upstream commit 27af86bb ]
      
      The pr_err in sctp_hash_transport was supposed to report a sctp bug
      for using rhashtable/rhlist.
      
      The err '-EEXIST' introduced in Commit cd2b7087 ("sctp: check
      duplicate node before inserting a new transport") doesn't belong
      to that case.
      
      So just return -EEXIST without printing any kernel message via pr_err().
      
      Fixes: cd2b7087 ("sctp: check duplicate node before inserting a new transport")
      Reported-by: Wei Chen <weichen@redhat.com>
      Signed-off-by: Xin Long <lucien.xin@gmail.com>
      Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
      Acked-by: Neil Horman <nhorman@tuxdriver.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      36728a6b
    • Ivan Vecera's avatar
      net/sched: cls_u32: fix cls_u32 on filter replace · 54d6bc97
      Ivan Vecera authored
      
      [ Upstream commit eb53f7af ]
      
      The following sequence is currently broken:
      
       # tc qdisc add dev foo ingress
       # tc filter replace dev foo protocol all ingress \
         u32 match u8 0 0 action mirred egress mirror dev bar1
       # tc filter replace dev foo protocol all ingress \
         handle 800::800 pref 49152 \
         u32 match u8 0 0 action mirred egress mirror dev bar2
       Error: cls_u32: Key node flags do not match passed flags.
       We have an error talking to the kernel, -1
      
      The error comes from u32_change() when comparing new and
      existing flags. The existing ones always contain one of the
      TCA_CLS_FLAGS_{,NOT}_IN_HW flags depending on offloading state.
      These flags cannot be passed from userspace so the condition
      (n->flags != flags) in u32_change() always fails.
      
      Fix the condition so the flags TCA_CLS_FLAGS_NOT_IN_HW and
      TCA_CLS_FLAGS_IN_HW are not taken into account.
      
      Fixes: 24d3dc6d ("net/sched: cls_u32: Reflect HW offload status")
      Signed-off-by: Ivan Vecera <ivecera@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      54d6bc97
    • Eric Dumazet's avatar
      net_sched: gen_estimator: fix broken estimators based on percpu stats · a01550d7
      Eric Dumazet authored
      
      [ Upstream commit a5f7add3 ]
      
      pfifo_fast got percpu stats lately, uncovering a bug I introduced last
      year in linux-4.10.
      
      I missed the fact that we have to clear our temporary storage
      before calling __gnet_stats_copy_basic() in the case of percpu stats.
      
      Without this fix, rate estimators (tc qd replace dev xxx root est 1sec
      4sec pfifo_fast) are utterly broken.
      
      Fixes: 1c0d32fd ("net_sched: gen_estimator: complete rewrite of rate estimators")
      Signed-off-by: Eric Dumazet <edumazet@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      a01550d7
    • Inbar Karmy's avatar
      net/mlx5e: Fix loopback self test when GRO is off · 5b5be45e
      Inbar Karmy authored
      
      [ Upstream commit ef7a3518 ]
      
      When GRO is off, the transport header pointer in sk_buff is
      initialized to network's header.
      
      To find the udp header, instead of using udp_hdr() which assumes
      skb_network_header was set, manually calculate the udp header offset.
      
      Fixes: 0952da79 ("net/mlx5e: Add support for loopback selftest")
      Signed-off-by: Inbar Karmy <inbark@mellanox.com>
      Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      5b5be45e
    • Tonghao Zhang's avatar
      doc: Change the min default value of tcp_wmem/tcp_rmem. · ff01f118
      Tonghao Zhang authored
      
      [ Upstream commit a61a86f8 ]
      
      The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096. And the
      tcp_wmem/tcp_rmem min default values are 4096.
      
      Fixes: bd68a2a8 ("net: set SK_MEM_QUANTUM to 4096")
      Cc: Eric Dumazet <edumazet@google.com>
      Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      ff01f118
    • Eric Dumazet's avatar
      tcp_bbr: better deal with suboptimal GSO · d6a76199
      Eric Dumazet authored
      
      [ Upstream commit 350c9f48 ]
      
      BBR uses tcp_tso_autosize() in an attempt to probe what would be the
      burst sizes and to adjust cwnd in bbr_target_cwnd() with following
      gold formula :
      
      /* Allow enough full-sized skbs in flight to utilize end systems. */
      cwnd += 3 * bbr->tso_segs_goal;
      
      But GSO can be lacking or be constrained to very small
      units (ip link set dev ... gso_max_segs 2)
      
      What we really want is to have enough packets in flight so that both
      GSO and GRO are efficient.
      
      So in the case GSO is off or downgraded, we still want to have the same
      number of packets in flight as if GSO/TSO was fully operational, so
      that GRO can hopefully be working efficiently.
      
      To fix this issue, we make tcp_tso_autosize() unaware of
      sk->sk_gso_max_segs
      
      Only tcp_tso_segs() has to enforce the gso_max_segs limit.
      
      Tested:
      
      ethtool -K eth0 tso off gso off
      tc qd replace dev eth0 root pfifo_fast
      
      Before patch:
      for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done
          691  (ss -temoi shows cwnd is stuck around 6 )
          667
          651
          631
          517
      
      After patch :
      # for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done
         1733 (ss -temoi shows cwnd is around 386 )
         1778
         1746
         1781
         1718
      
      Fixes: 0f8782ea ("tcp_bbr: add BBR congestion control")
      Signed-off-by: Eric Dumazet <edumazet@google.com>
      Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
      Acked-by: Neal Cardwell <ncardwell@google.com>
      Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      d6a76199
    • David Howells's avatar
      rxrpc: Fix send in rxrpc_send_data_packet() · f0a04a0e
      David Howells authored
      
      [ Upstream commit 93c62c45 ]
      
      All the kernel_sendmsg() calls in rxrpc_send_data_packet() need to send
      both parts of the iov[] buffer, but one of them does not.  Fix it so that
      it does.
      
      Without this, short IPv6 rxrpc DATA packets may be seen that have the rxrpc
      header included, but no payload.
      
      Fixes: 5a924b89 ("rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs")
      Reported-by: Marc Dionne <marc.dionne@auristor.com>
      Signed-off-by: David Howells <dhowells@redhat.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      f0a04a0e
    • Ilya Lesokhin's avatar
      tcp: Honor the eor bit in tcp_mtu_probe · 17634603
      Ilya Lesokhin authored
      
      [ Upstream commit 808cf9e3 ]
      
      Avoid SKB coalescing if eor bit is set in one of the relevant
      SKBs.
      
      Fixes: c134ecb8 ("tcp: Make use of MSG_EOR in tcp_sendmsg")
      Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      17634603
    • Heiner Kallweit's avatar
      net: phy: fix phy_start to consider PHY_IGNORE_INTERRUPT · dcb5da20
      Heiner Kallweit authored
      
      [ Upstream commit 08f51385 ]
      
      This condition wasn't adjusted when PHY_IGNORE_INTERRUPT (-2) was added
      long ago. In case of PHY_IGNORE_INTERRUPT the MAC interrupt also
      indicates PHY state changes, and we should do what the symbol says.
      
      Fixes: 84a527a4 ("net: phylib: fix interrupts re-enablement in phy_start")
      Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
      Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      dcb5da20
    • Gal Pressman's avatar
      net/mlx5e: Specify numa node when allocating drop rq · f26693d3
      Gal Pressman authored
      
      [ Upstream commit 2f0db879 ]
      
      When allocating a drop rq, no numa node is explicitly set which means
      allocations are done on node zero. This is not necessarily the nearest
      numa node to the HCA, and even worse, might even be a memoryless numa
      node.
      
      Choose the numa_node given to us by the pci device in order to properly
      allocate the coherent dma memory instead of assuming zero is valid.
      
      Fixes: 556dd1b9 ("net/mlx5e: Set drop RQ's necessary parameters only")
      Signed-off-by: Gal Pressman <galp@mellanox.com>
      Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      f26693d3
    • Shalom Toledo's avatar
      mlxsw: spectrum_switchdev: Check success of FDB add operation · 2229dd5d
      Shalom Toledo authored
      
      [ Upstream commit 0a8a1bf1 ]
      
      Until now, we assumed that in case of error when adding FDB entries, the
      write operation will fail, but this is not the case. Instead, we need to
      check that the number of entries reported in the response is equal to
      the number of entries specified in the request.
      
      Fixes: 56ade8fe ("mlxsw: spectrum: Add initial support for Spectrum ASIC")
      Reported-by: Ido Schimmel <idosch@mellanox.com>
      Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
      Reviewed-by: Ido Schimmel <idosch@mellanox.com>
      Signed-off-by: Jiri Pirko <jiri@mellanox.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      2229dd5d
    • Tommi Rantala's avatar
      sctp: fix dst refcnt leak in sctp_v4_get_dst · 9f02a069
      Tommi Rantala authored
      
      [ Upstream commit 4a31a6b1 ]
      
      Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
      410f0383 ("sctp: add routing output fallback"):
      
      When walking the address_list, successive ip_route_output_key() calls
      may return the same rt->dst with the reference incremented on each call.
      
      The code would not decrement the dst refcount when the dst pointer was
      identical to the one from the previous iteration, causing the dst refcnt leak.
      
      Testcase:
        ip netns add TEST
        ip netns exec TEST ip link set lo up
        ip link add dummy0 type dummy
        ip link add dummy1 type dummy
        ip link add dummy2 type dummy
        ip link set dev dummy0 netns TEST
        ip link set dev dummy1 netns TEST
        ip link set dev dummy2 netns TEST
        ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
        ip netns exec TEST ip link set dummy0 up
        ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
        ip netns exec TEST ip link set dummy1 up
        ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
        ip netns exec TEST ip link set dummy2 up
        ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
        ip netns del TEST
      
      In 4.4 and 4.9 kernels this results in:
        [  354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
        [  364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
        [  374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
        [  384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
        [  395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
        [  405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
        ...
      
      Fixes: 410f0383 ("sctp: add routing output fallback")
      Fixes: 0ca50d12 ("sctp: fix src address selection if using secondary addresses")
      Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
      Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
      Acked-by: Neil Horman <nhorman@tuxdriver.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      9f02a069
    • Gal Pressman's avatar
      net/mlx5e: Fix TCP checksum in LRO buffers · bf014cc1
      Gal Pressman authored
      
      [ Upstream commit 8babd44d ]
      
      When receiving an LRO packet, the checksum field is set by the hardware
      to the checksum of the first coalesced packet. Obviously, this checksum
      is not valid for the merged LRO packet and should be fixed.  We can use
      the CQE checksum which covers the checksum of the entire merged packet
      TCP payload to help us calculate the checksum incrementally.
      
      Tested by sending IPv4/6 traffic with LRO enabled, RX checksum disabled
      and watching nstat checksum error counters (in addition to the obvious
      bandwidth drop caused by checksum errors).
      
      This bug is usually "hidden" since LRO packets would go through the
      CHECKSUM_UNNECESSARY flow which does not validate the packet checksum.
      
      It's important to note that previous to this patch, LRO packets provided
      with CHECKSUM_UNNECESSARY are indeed packets with a correct validated
      checksum (even though the checksum inside the TCP header is incorrect),
      since the hardware LRO aggregation is terminated upon receiving a packet
      with bad checksum.
      
      Fixes: e586b3b0 ("net/mlx5: Ethernet Datapath files")
      Signed-off-by: Gal Pressman <galp@mellanox.com>
      Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      bf014cc1
    • Alexey Kodanev's avatar
      udplite: fix partial checksum initialization · fecb84a8
      Alexey Kodanev authored
      
      [ Upstream commit 15f35d49 ]
      
      Since UDP-Lite is always using checksum, the following path is
      triggered when calculating pseudo header for it:
      
        udp4_csum_init() or udp6_csum_init()
          skb_checksum_init_zero_check()
            __skb_checksum_validate_complete()
      
      The problem can appear if skb->len is less than CHECKSUM_BREAK. In
      this particular case __skb_checksum_validate_complete() also invokes
      __skb_checksum_complete(skb). If UDP-Lite is using partial checksum
      that covers only part of a packet, the function will return bad
      checksum and the packet will be dropped.
      
      It can be fixed if we skip skb_checksum_init_zero_check() and only
      set the required pseudo header checksum for UDP-Lite with partial
      checksum before udp4_csum_init()/udp6_csum_init() functions return.
      
      Fixes: ed70fcfc ("net: Call skb_checksum_init in IPv4")
      Fixes: e4f45b7f ("net: Call skb_checksum_init in IPv6")
      Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      fecb84a8
    • Alexey Kodanev's avatar
      sctp: verify size of a new chunk in _sctp_make_chunk() · 1fc74a57
      Alexey Kodanev authored
      
      [ Upstream commit 07f2c7ab ]
      
      When SCTP makes INIT or INIT_ACK packet the total chunk length
      can exceed SCTP_MAX_CHUNK_LEN which leads to kernel panic when
      transmitting these packets, e.g. the crash on sending INIT_ACK:
      
      [  597.804948] skbuff: skb_over_panic: text:00000000ffae06e4 len:120168
                     put:120156 head:000000007aa47635 data:00000000d991c2de
                     tail:0x1d640 end:0xfec0 dev:<NULL>
      ...
      [  597.976970] ------------[ cut here ]------------
      [  598.033408] kernel BUG at net/core/skbuff.c:104!
      [  600.314841] Call Trace:
      [  600.345829]  <IRQ>
      [  600.371639]  ? sctp_packet_transmit+0x2095/0x26d0 [sctp]
      [  600.436934]  skb_put+0x16c/0x200
      [  600.477295]  sctp_packet_transmit+0x2095/0x26d0 [sctp]
      [  600.540630]  ? sctp_packet_config+0x890/0x890 [sctp]
      [  600.601781]  ? __sctp_packet_append_chunk+0x3b4/0xd00 [sctp]
      [  600.671356]  ? sctp_cmp_addr_exact+0x3f/0x90 [sctp]
      [  600.731482]  sctp_outq_flush+0x663/0x30d0 [sctp]
      [  600.788565]  ? sctp_make_init+0xbf0/0xbf0 [sctp]
      [  600.845555]  ? sctp_check_transmitted+0x18f0/0x18f0 [sctp]
      [  600.912945]  ? sctp_outq_tail+0x631/0x9d0 [sctp]
      [  600.969936]  sctp_cmd_interpreter.isra.22+0x3be1/0x5cb0 [sctp]
      [  601.041593]  ? sctp_sf_do_5_1B_init+0x85f/0xc30 [sctp]
      [  601.104837]  ? sctp_generate_t1_cookie_event+0x20/0x20 [sctp]
      [  601.175436]  ? sctp_eat_data+0x1710/0x1710 [sctp]
      [  601.233575]  sctp_do_sm+0x182/0x560 [sctp]
      [  601.284328]  ? sctp_has_association+0x70/0x70 [sctp]
      [  601.345586]  ? sctp_rcv+0xef4/0x32f0 [sctp]
      [  601.397478]  ? sctp6_rcv+0xa/0x20 [sctp]
      ...
      
      Here the chunk size for INIT_ACK packet becomes too big, mostly
      because of the state cookie (INIT packet has large size with
      many address parameters), plus additional server parameters.
      
      Later this chunk causes the panic in skb_put_data():
      
        sctp_packet_transmit()
            sctp_packet_pack()
                skb_put_data(nskb, chunk->skb->data, chunk->skb->len);
      
      'nskb' (head skb) was previously allocated with packet->size
      from u16 'chunk->chunk_hdr->length'.
      
      As suggested by Marcelo, we should check the chunk's length in
      _sctp_make_chunk() before trying to allocate an skb for it and
      discard a chunk if its size is bigger than SCTP_MAX_CHUNK_LEN.
      Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
      Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
      Acked-by: Neil Horman <nhorman@tuxdriver.com>
      Signed-off-by: David S. Miller <davem@davemloft.net>
      Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
      1fc74a57