Commit 571a9233 authored by Dave Airlie

Merge tag 'drm/tegra/for-5.15-rc1' of ssh://git.freedesktop.org/git/tegra/linux into drm-next

drm/tegra: Changes for v5.15-rc1

The bulk of these changes is a more modern ABI that can be efficiently
used on newer SoCs as well as older ones. The userspace parts for this
are available here:

  - libdrm support: https://gitlab.freedesktop.org/tagr/drm/-/commits/drm-tegra-uabi-v8
  - VAAPI driver: https://github.com/cyndis/vaapi-tegra-driver

In addition, existing userspace from the grate reverse-engineering
project has been updated to use this new ABI:

  - X11 driver: https://github.com/grate-driver/xf86-video-opentegra
  - 3D driver: https://github.com/grate-driver/grate

Other than that, there's also support for display memory bandwidth
management for various generations and a bit of cleanup.
Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Thierry Reding <thierry.reding@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210813163616.2822355-1-thierry.reding@gmail.com
parents 0def4b73 fed02893
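To give a flavour of the new ABI before the diff itself, below is a rough userspace sketch of the channel/syncpoint/submit flow added by this series. It is illustrative only: the ioctl names come from the ioctl table in this patch, but the structure layouts and field names used here (host1x_class, gather_data_ptr and so on) are assumptions that should be checked against include/uapi/drm/tegra_drm.h from the libdrm branch linked above, and a real job also needs a command list describing the gather, which is omitted for brevity.

/* Hypothetical sketch of the new Tegra DRM UABI; field names are best-effort
 * guesses and must be verified against include/uapi/drm/tegra_drm.h. */
#include <fcntl.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/tegra_drm.h>

static void example_submit(void)
{
	int fd = open("/dev/dri/renderD128", O_RDWR);	/* render node */

	/* Open a channel to a host1x engine class (class value is hypothetical). */
	struct drm_tegra_channel_open open_args = { .host1x_class = 0x5d };
	ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_OPEN, &open_args);

	/* Allocate a syncpoint used to signal job completion. */
	struct drm_tegra_syncpoint_allocate sp_args = { 0 };
	ioctl(fd, DRM_IOCTL_TEGRA_SYNCPOINT_ALLOCATE, &sp_args);

	/* Submit a small userspace gather (host1x opcodes) on the channel.
	 * The command array that actually references the gather is omitted. */
	uint32_t gather[4] = { /* host1x opcodes */ };
	struct drm_tegra_channel_submit submit_args = {
		.context           = open_args.context,
		.gather_data_ptr   = (uintptr_t)gather,
		.gather_data_words = 4,
		.syncpt.id         = sp_args.id,
		.syncpt.increments = 1,
	};
	ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_SUBMIT, &submit_args);
}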
@@ -9,6 +9,7 @@ config DRM_TEGRA
select DRM_MIPI_DSI
select DRM_PANEL
select TEGRA_HOST1X
select INTERCONNECT
select IOMMU_IOVA
select CEC_CORE if CEC_NOTIFIER
help
...
@@ -3,6 +3,9 @@ ccflags-$(CONFIG_DRM_TEGRA_DEBUG) += -DDEBUG
tegra-drm-y := \
drm.o \
uapi.o \
submit.o \
firewall.o \
gem.o \
fb.o \
dp.o \
...
@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/iommu.h>
#include <linux/interconnect.h>
#include <linux/module.h>
#include <linux/of_device.h>
#include <linux/pm_runtime.h>
@@ -618,9 +619,14 @@ static int tegra_plane_atomic_check(struct drm_plane *plane,
struct tegra_dc *dc = to_tegra_dc(new_plane_state->crtc);
int err;
plane_state->peak_memory_bandwidth = 0;
plane_state->avg_memory_bandwidth = 0;
/* no need for further checks if the plane is being disabled */
if (!new_plane_state->crtc) if (!new_plane_state->crtc) {
plane_state->total_peak_memory_bandwidth = 0;
return 0;
}
err = tegra_plane_format(new_plane_state->fb->format->format,
&plane_state->format,
@@ -808,6 +814,12 @@ static struct drm_plane *tegra_primary_plane_create(struct drm_device *drm,
formats = dc->soc->primary_formats;
modifiers = dc->soc->modifiers;
err = tegra_plane_interconnect_init(plane);
if (err) {
kfree(plane);
return ERR_PTR(err);
}
err = drm_universal_plane_init(drm, &plane->base, possible_crtcs,
&tegra_plane_funcs, formats,
num_formats, modifiers, type, NULL);
@@ -845,12 +857,18 @@ static int tegra_cursor_atomic_check(struct drm_plane *plane,
{
struct drm_plane_state *new_plane_state = drm_atomic_get_new_plane_state(state,
plane);
struct tegra_plane_state *plane_state = to_tegra_plane_state(new_plane_state);
struct tegra_plane *tegra = to_tegra_plane(plane);
int err;
plane_state->peak_memory_bandwidth = 0;
plane_state->avg_memory_bandwidth = 0;
/* no need for further checks if the plane is being disabled */
if (!new_plane_state->crtc) if (!new_plane_state->crtc) {
plane_state->total_peak_memory_bandwidth = 0;
return 0;
}
/* scaling not supported for cursor */
if ((new_plane_state->src_w >> 16 != new_plane_state->crtc_w) ||
@@ -1030,6 +1048,12 @@ static struct drm_plane *tegra_dc_cursor_plane_create(struct drm_device *drm,
if (!dc->soc->has_nvdisplay) {
num_formats = ARRAY_SIZE(tegra_legacy_cursor_plane_formats);
formats = tegra_legacy_cursor_plane_formats;
err = tegra_plane_interconnect_init(plane);
if (err) {
kfree(plane);
return ERR_PTR(err);
}
} else {
num_formats = ARRAY_SIZE(tegra_cursor_plane_formats);
formats = tegra_cursor_plane_formats;
@@ -1149,6 +1173,12 @@ static struct drm_plane *tegra_dc_overlay_plane_create(struct drm_device *drm,
num_formats = dc->soc->num_overlay_formats;
formats = dc->soc->overlay_formats;
err = tegra_plane_interconnect_init(plane);
if (err) {
kfree(plane);
return ERR_PTR(err);
}
if (!cursor)
type = DRM_PLANE_TYPE_OVERLAY;
else
@@ -1572,6 +1602,11 @@ static int tegra_dc_show_stats(struct seq_file *s, void *data)
seq_printf(s, "underflow: %lu\n", dc->stats.underflow);
seq_printf(s, "overflow: %lu\n", dc->stats.overflow);
seq_printf(s, "frames total: %lu\n", dc->stats.frames_total);
seq_printf(s, "vblank total: %lu\n", dc->stats.vblank_total);
seq_printf(s, "underflow total: %lu\n", dc->stats.underflow_total);
seq_printf(s, "overflow total: %lu\n", dc->stats.overflow_total);
return 0;
}
@@ -1804,6 +1839,106 @@ static int tegra_dc_wait_idle(struct tegra_dc *dc, unsigned long timeout)
return -ETIMEDOUT;
}
static void
tegra_crtc_update_memory_bandwidth(struct drm_crtc *crtc,
struct drm_atomic_state *state,
bool prepare_bandwidth_transition)
{
const struct tegra_plane_state *old_tegra_state, *new_tegra_state;
const struct tegra_dc_state *old_dc_state, *new_dc_state;
u32 i, new_avg_bw, old_avg_bw, new_peak_bw, old_peak_bw;
const struct drm_plane_state *old_plane_state;
const struct drm_crtc_state *old_crtc_state;
struct tegra_dc_window window, old_window;
struct tegra_dc *dc = to_tegra_dc(crtc);
struct tegra_plane *tegra;
struct drm_plane *plane;
if (dc->soc->has_nvdisplay)
return;
old_crtc_state = drm_atomic_get_old_crtc_state(state, crtc);
old_dc_state = to_const_dc_state(old_crtc_state);
new_dc_state = to_const_dc_state(crtc->state);
if (!crtc->state->active) {
if (!old_crtc_state->active)
return;
/*
* When CRTC is disabled on DPMS, the state of attached planes
* is kept unchanged. Hence we need to enforce removal of the
* bandwidths from the ICC paths.
*/
drm_atomic_crtc_for_each_plane(plane, crtc) {
tegra = to_tegra_plane(plane);
icc_set_bw(tegra->icc_mem, 0, 0);
icc_set_bw(tegra->icc_mem_vfilter, 0, 0);
}
return;
}
for_each_old_plane_in_state(old_crtc_state->state, plane,
old_plane_state, i) {
old_tegra_state = to_const_tegra_plane_state(old_plane_state);
new_tegra_state = to_const_tegra_plane_state(plane->state);
tegra = to_tegra_plane(plane);
/*
* We're iterating over the global atomic state and it contains
* planes from another CRTC, hence we need to filter out the
* planes unrelated to this CRTC.
*/
if (tegra->dc != dc)
continue;
new_avg_bw = new_tegra_state->avg_memory_bandwidth;
old_avg_bw = old_tegra_state->avg_memory_bandwidth;
new_peak_bw = new_tegra_state->total_peak_memory_bandwidth;
old_peak_bw = old_tegra_state->total_peak_memory_bandwidth;
/*
* See the comment related to !crtc->state->active above,
* which explains why bandwidths need to be updated when
* CRTC is turning ON.
*/
if (new_avg_bw == old_avg_bw && new_peak_bw == old_peak_bw &&
old_crtc_state->active)
continue;
window.src.h = drm_rect_height(&plane->state->src) >> 16;
window.dst.h = drm_rect_height(&plane->state->dst);
old_window.src.h = drm_rect_height(&old_plane_state->src) >> 16;
old_window.dst.h = drm_rect_height(&old_plane_state->dst);
/*
* During the preparation phase (atomic_begin), the memory
* freq should go high before the DC changes are committed
* if bandwidth requirement goes up, otherwise memory freq
* should stay high if BW requirement goes down. The
* opposite applies to the completion phase (post_commit).
*/
if (prepare_bandwidth_transition) {
new_avg_bw = max(old_avg_bw, new_avg_bw);
new_peak_bw = max(old_peak_bw, new_peak_bw);
if (tegra_plane_use_vertical_filtering(tegra, &old_window))
window = old_window;
}
icc_set_bw(tegra->icc_mem, new_avg_bw, new_peak_bw);
if (tegra_plane_use_vertical_filtering(tegra, &window))
icc_set_bw(tegra->icc_mem_vfilter, new_avg_bw, new_peak_bw);
else
icc_set_bw(tegra->icc_mem_vfilter, 0, 0);
}
}
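/*
 * Worked example of the prepare/complete split above (illustrative numbers):
 * suppose a plane's requirement changes from old_peak_bw = 100000 to
 * new_peak_bw = 500000 (ICC units, kbytes/sec).
 *
 *   atomic_begin (prepare_bandwidth_transition == true):
 *     icc_set_bw(..., max(100000, 500000)) -> 500000, i.e. the memory
 *     frequency is raised before the new window setup is latched.
 *
 *   post-commit (prepare_bandwidth_transition == false):
 *     icc_set_bw(..., 500000), unchanged.
 *
 * For the opposite transition (500000 -> 100000) the prepare phase keeps
 * max(500000, 100000) = 500000, and the bandwidth only drops to 100000 in
 * tegra_crtc_atomic_post_commit(), once the VBLANK event confirms that the
 * hardware has latched the smaller configuration.
 */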
static void tegra_crtc_atomic_disable(struct drm_crtc *crtc,
struct drm_atomic_state *state)
{
@@ -1985,6 +2120,8 @@ static void tegra_crtc_atomic_begin(struct drm_crtc *crtc,
{
unsigned long flags;
tegra_crtc_update_memory_bandwidth(crtc, state, true);
if (crtc->state->event) {
spin_lock_irqsave(&crtc->dev->event_lock, flags);
@@ -2017,7 +2154,207 @@ static void tegra_crtc_atomic_flush(struct drm_crtc *crtc,
value = tegra_dc_readl(dc, DC_CMD_STATE_CONTROL);
}
static bool tegra_plane_is_cursor(const struct drm_plane_state *state)
{
const struct tegra_dc_soc_info *soc = to_tegra_dc(state->crtc)->soc;
const struct drm_format_info *fmt = state->fb->format;
unsigned int src_w = drm_rect_width(&state->src) >> 16;
unsigned int dst_w = drm_rect_width(&state->dst);
if (state->plane->type != DRM_PLANE_TYPE_CURSOR)
return false;
if (soc->supports_cursor)
return true;
if (src_w != dst_w || fmt->num_planes != 1 || src_w * fmt->cpp[0] > 256)
return false;
return true;
}
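/*
 * Example of the check above (illustrative sizes): a 64x64 ARGB8888 cursor
 * has src_w == dst_w == 64 and 64 * 4 = 256 bytes per line, so it still
 * qualifies on SoCs without dedicated cursor support (Tegra20/30); a
 * 128x128 ARGB8888 cursor needs 512 bytes per line and is treated as a
 * regular window for bandwidth purposes.
 */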
static unsigned long
tegra_plane_overlap_mask(struct drm_crtc_state *state,
const struct drm_plane_state *plane_state)
{
const struct drm_plane_state *other_state;
const struct tegra_plane *tegra;
unsigned long overlap_mask = 0;
struct drm_plane *plane;
struct drm_rect rect;
if (!plane_state->visible || !plane_state->fb)
return 0;
/*
* Data-prefetch FIFO will easily help to overcome temporal memory
* pressure if other plane overlaps with the cursor plane.
*/
if (tegra_plane_is_cursor(plane_state))
return 0;
drm_atomic_crtc_state_for_each_plane_state(plane, other_state, state) {
rect = plane_state->dst;
tegra = to_tegra_plane(other_state->plane);
if (!other_state->visible || !other_state->fb)
continue;
/*
* Ignore cursor plane overlaps because it's not practical to
* assume that it contributes to the bandwidth in overlapping
* area if window width is small.
*/
if (tegra_plane_is_cursor(other_state))
continue;
if (drm_rect_intersect(&rect, &other_state->dst))
overlap_mask |= BIT(tegra->index);
}
return overlap_mask;
}
static int tegra_crtc_calculate_memory_bandwidth(struct drm_crtc *crtc,
struct drm_atomic_state *state)
{
ulong overlap_mask[TEGRA_DC_LEGACY_PLANES_NUM] = {}, mask;
u32 plane_peak_bw[TEGRA_DC_LEGACY_PLANES_NUM] = {};
bool all_planes_overlap_simultaneously = true;
const struct tegra_plane_state *tegra_state;
const struct drm_plane_state *plane_state;
struct tegra_dc *dc = to_tegra_dc(crtc);
const struct drm_crtc_state *old_state;
struct drm_crtc_state *new_state;
struct tegra_plane *tegra;
struct drm_plane *plane;
/*
* The nv-display uses shared planes. The algorithm below assumes
* maximum 3 planes per-CRTC, this assumption isn't applicable to
* the nv-display. Note that T124 has additional windows,
* but currently they aren't supported by the driver.
*/
if (dc->soc->has_nvdisplay)
return 0;
new_state = drm_atomic_get_new_crtc_state(state, crtc);
old_state = drm_atomic_get_old_crtc_state(state, crtc);
/*
* For overlapping planes, pixel data is fetched for each plane at
* the same time, hence bandwidths are accumulated in this case.
* This needs to be taken into account for calculating total bandwidth
* consumed by all planes.
*
* Here we get the overlapping state of each plane, which is a
* bitmask of plane indices telling with what planes there is an
* overlap. Note that bitmask[plane] includes BIT(plane) in order
* to make further code nicer and simpler.
*/
drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, new_state) {
tegra_state = to_const_tegra_plane_state(plane_state);
tegra = to_tegra_plane(plane);
if (WARN_ON_ONCE(tegra->index >= TEGRA_DC_LEGACY_PLANES_NUM))
return -EINVAL;
plane_peak_bw[tegra->index] = tegra_state->peak_memory_bandwidth;
mask = tegra_plane_overlap_mask(new_state, plane_state);
overlap_mask[tegra->index] = mask;
if (hweight_long(mask) != 3)
all_planes_overlap_simultaneously = false;
}
/*
* Then we calculate maximum bandwidth of each plane state.
* The bandwidth includes the plane BW + BW of the "simultaneously"
* overlapping planes, where "simultaneously" means areas where DC
* fetches from the planes simultaneously during the scan-out process.
*
* For example, if plane A overlaps with planes B and C, but B and C
* don't overlap, then the peak bandwidth will be either in area where
* A-and-B or A-and-C planes overlap.
*
* The plane_peak_bw[] contains peak memory bandwidth values of
* each plane, this information is needed by interconnect provider
* in order to set up latency allowance based on the peak BW, see
* tegra_crtc_update_memory_bandwidth().
*/
drm_atomic_crtc_state_for_each_plane_state(plane, plane_state, new_state) {
u32 i, old_peak_bw, new_peak_bw, overlap_bw = 0;
/*
* Note that plane's atomic check doesn't touch the
* total_peak_memory_bandwidth of enabled plane, hence the
* current state contains the old bandwidth state from the
* previous CRTC commit.
*/
tegra_state = to_const_tegra_plane_state(plane_state);
tegra = to_tegra_plane(plane);
for_each_set_bit(i, &overlap_mask[tegra->index], 3) {
if (i == tegra->index)
continue;
if (all_planes_overlap_simultaneously)
overlap_bw += plane_peak_bw[i];
else
overlap_bw = max(overlap_bw, plane_peak_bw[i]);
}
new_peak_bw = plane_peak_bw[tegra->index] + overlap_bw;
old_peak_bw = tegra_state->total_peak_memory_bandwidth;
/*
* If plane's peak bandwidth changed (for example plane isn't
* overlapped anymore) and plane isn't in the atomic state,
* then add plane to the state in order to have the bandwidth
* updated.
*/
if (old_peak_bw != new_peak_bw) {
struct tegra_plane_state *new_tegra_state;
struct drm_plane_state *new_plane_state;
new_plane_state = drm_atomic_get_plane_state(state, plane);
if (IS_ERR(new_plane_state))
return PTR_ERR(new_plane_state);
new_tegra_state = to_tegra_plane_state(new_plane_state);
new_tegra_state->total_peak_memory_bandwidth = new_peak_bw;
}
}
return 0;
}
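/*
 * Worked example of the overlap accounting above (illustrative indices and
 * bandwidth values): three windows A (index 0), B (1) and C (2) with peak
 * bandwidths 300, 200 and 100 kbytes/sec, where A overlaps both B and C but
 * B and C do not overlap each other:
 *
 *   overlap_mask[0] = BIT(0) | BIT(1) | BIT(2) = 0x7
 *   overlap_mask[1] = BIT(0) | BIT(1)          = 0x3
 *   overlap_mask[2] = BIT(0) | BIT(2)          = 0x5
 *
 * hweight_long(0x3) != 3, so all_planes_overlap_simultaneously is false and
 * overlapping bandwidths combine with max() rather than a sum:
 *
 *   total_peak[A] = 300 + max(200, 100) = 500
 *   total_peak[B] = 200 + 300           = 500
 *   total_peak[C] = 100 + 300           = 400
 *
 * If all three windows overlapped in the same area, every mask would be 0x7
 * and each total would be the plain sum 300 + 200 + 100 = 600.
 */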
static int tegra_crtc_atomic_check(struct drm_crtc *crtc,
struct drm_atomic_state *state)
{
int err;
err = tegra_crtc_calculate_memory_bandwidth(crtc, state);
if (err)
return err;
return 0;
}
void tegra_crtc_atomic_post_commit(struct drm_crtc *crtc,
struct drm_atomic_state *state)
{
/*
* Display bandwidth is allowed to go down only once hardware state
* is known to be armed, i.e. state was committed and VBLANK event
* received.
*/
tegra_crtc_update_memory_bandwidth(crtc, state, false);
}
static const struct drm_crtc_helper_funcs tegra_crtc_helper_funcs = {
.atomic_check = tegra_crtc_atomic_check,
.atomic_begin = tegra_crtc_atomic_begin,
.atomic_flush = tegra_crtc_atomic_flush,
.atomic_enable = tegra_crtc_atomic_enable,
@@ -2036,6 +2373,7 @@ static irqreturn_t tegra_dc_irq(int irq, void *data)
/*
dev_dbg(dc->dev, "%s(): frame end\n", __func__);
*/
dc->stats.frames_total++;
dc->stats.frames++;
}
@@ -2044,6 +2382,7 @@ static irqreturn_t tegra_dc_irq(int irq, void *data)
dev_dbg(dc->dev, "%s(): vertical blank\n", __func__);
*/
drm_crtc_handle_vblank(&dc->base);
dc->stats.vblank_total++;
dc->stats.vblank++;
}
@@ -2051,6 +2390,7 @@ static irqreturn_t tegra_dc_irq(int irq, void *data)
/*
dev_dbg(dc->dev, "%s(): underflow\n", __func__);
*/
dc->stats.underflow_total++;
dc->stats.underflow++;
}
@@ -2058,11 +2398,13 @@ static irqreturn_t tegra_dc_irq(int irq, void *data)
/*
dev_dbg(dc->dev, "%s(): overflow\n", __func__);
*/
dc->stats.overflow_total++;
dc->stats.overflow++;
}
if (status & HEAD_UF_INT) {
dev_dbg_ratelimited(dc->dev, "%s(): head underflow\n", __func__);
dc->stats.underflow_total++;
dc->stats.underflow++;
}
@@ -2343,7 +2685,9 @@ static const struct tegra_dc_soc_info tegra20_dc_soc_info = {
.overlay_formats = tegra20_overlay_formats,
.modifiers = tegra20_modifiers,
.has_win_a_without_filters = true,
.has_win_b_vfilter_mem_client = true,
.has_win_c_without_vert_filter = true,
.plane_tiled_memory_bandwidth_x2 = false,
};
static const struct tegra_dc_soc_info tegra30_dc_soc_info = {
@@ -2363,7 +2707,9 @@ static const struct tegra_dc_soc_info tegra30_dc_soc_info = {
.overlay_formats = tegra20_overlay_formats,
.modifiers = tegra20_modifiers,
.has_win_a_without_filters = false,
.has_win_b_vfilter_mem_client = true,
.has_win_c_without_vert_filter = false,
.plane_tiled_memory_bandwidth_x2 = true,
};
static const struct tegra_dc_soc_info tegra114_dc_soc_info = {
@@ -2383,7 +2729,9 @@ static const struct tegra_dc_soc_info tegra114_dc_soc_info = {
.overlay_formats = tegra114_overlay_formats,
.modifiers = tegra20_modifiers,
.has_win_a_without_filters = false,
.has_win_b_vfilter_mem_client = false,
.has_win_c_without_vert_filter = false,
.plane_tiled_memory_bandwidth_x2 = true,
};
static const struct tegra_dc_soc_info tegra124_dc_soc_info = {
@@ -2403,7 +2751,9 @@ static const struct tegra_dc_soc_info tegra124_dc_soc_info = {
.overlay_formats = tegra124_overlay_formats,
.modifiers = tegra124_modifiers,
.has_win_a_without_filters = false,
.has_win_b_vfilter_mem_client = false,
.has_win_c_without_vert_filter = false,
.plane_tiled_memory_bandwidth_x2 = false,
};
static const struct tegra_dc_soc_info tegra210_dc_soc_info = {
@@ -2423,7 +2773,9 @@ static const struct tegra_dc_soc_info tegra210_dc_soc_info = {
.overlay_formats = tegra114_overlay_formats,
.modifiers = tegra124_modifiers,
.has_win_a_without_filters = false,
.has_win_b_vfilter_mem_client = false,
.has_win_c_without_vert_filter = false,
.plane_tiled_memory_bandwidth_x2 = false,
};
static const struct tegra_windowgroup_soc tegra186_dc_wgrps[] = {
@@ -2473,6 +2825,7 @@ static const struct tegra_dc_soc_info tegra186_dc_soc_info = {
.has_nvdisplay = true,
.wgrps = tegra186_dc_wgrps,
.num_wgrps = ARRAY_SIZE(tegra186_dc_wgrps),
.plane_tiled_memory_bandwidth_x2 = false,
};
static const struct tegra_windowgroup_soc tegra194_dc_wgrps[] = {
@@ -2522,6 +2875,7 @@ static const struct tegra_dc_soc_info tegra194_dc_soc_info = {
.has_nvdisplay = true,
.wgrps = tegra194_dc_wgrps,
.num_wgrps = ARRAY_SIZE(tegra194_dc_wgrps),
.plane_tiled_memory_bandwidth_x2 = false,
};
static const struct of_device_id tegra_dc_of_match[] = {
...
@@ -15,6 +15,8 @@
struct tegra_output;
#define TEGRA_DC_LEGACY_PLANES_NUM 7
struct tegra_dc_state {
struct drm_crtc_state base;
@@ -33,11 +35,22 @@ static inline struct tegra_dc_state *to_dc_state(struct drm_crtc_state *state)
return NULL;
}
static inline const struct tegra_dc_state *
to_const_dc_state(const struct drm_crtc_state *state)
{
return to_dc_state((struct drm_crtc_state *)state);
}
struct tegra_dc_stats {
unsigned long frames;
unsigned long vblank;
unsigned long underflow;
unsigned long overflow;
unsigned long frames_total;
unsigned long vblank_total;
unsigned long underflow_total;
unsigned long overflow_total;
};
struct tegra_windowgroup_soc {
@@ -66,7 +79,9 @@ struct tegra_dc_soc_info {
unsigned int num_overlay_formats;
const u64 *modifiers;
bool has_win_a_without_filters;
bool has_win_b_vfilter_mem_client;
bool has_win_c_without_vert_filter;
bool plane_tiled_memory_bandwidth_x2;
};
struct tegra_dc {
@@ -152,6 +167,8 @@ int tegra_dc_state_setup_clock(struct tegra_dc *dc,
struct drm_crtc_state *crtc_state,
struct clk *clk, unsigned long pclk,
unsigned int div);
void tegra_crtc_atomic_post_commit(struct drm_crtc *crtc,
struct drm_atomic_state *state);
/* from rgb.c */
int tegra_dc_rgb_probe(struct tegra_dc *dc);
...
@@ -21,24 +21,21 @@
#include <drm/drm_prime.h>
#include <drm/drm_vblank.h>
#include "dc.h"
#include "drm.h"
#include "gem.h"
#include "uapi.h"
#define DRIVER_NAME "tegra"
#define DRIVER_DESC "NVIDIA Tegra graphics"
#define DRIVER_DATE "20120330"
#define DRIVER_MAJOR 0 #define DRIVER_MAJOR 1
#define DRIVER_MINOR 0
#define DRIVER_PATCHLEVEL 0
#define CARVEOUT_SZ SZ_64M
#define CDMA_GATHER_FETCHES_MAX_NB 16383
struct tegra_drm_file {
struct idr contexts;
struct mutex lock;
};
static int tegra_atomic_check(struct drm_device *drm,
struct drm_atomic_state *state)
{
@@ -60,6 +57,17 @@ static const struct drm_mode_config_funcs tegra_drm_mode_config_funcs = {
.atomic_commit = drm_atomic_helper_commit,
};
static void tegra_atomic_post_commit(struct drm_device *drm,
struct drm_atomic_state *old_state)
{
struct drm_crtc_state *old_crtc_state __maybe_unused;
struct drm_crtc *crtc;
unsigned int i;
for_each_old_crtc_in_state(old_state, crtc, old_crtc_state, i)
tegra_crtc_atomic_post_commit(crtc, old_state);
}
static void tegra_atomic_commit_tail(struct drm_atomic_state *old_state)
{
struct drm_device *drm = old_state->dev;
@@ -79,6 +87,8 @@ static void tegra_atomic_commit_tail(struct drm_atomic_state *old_state)
} else {
drm_atomic_helper_commit_tail_rpm(old_state);
}
tegra_atomic_post_commit(drm, old_state);
}
static const struct drm_mode_config_helper_funcs
@@ -94,7 +104,9 @@ static int tegra_drm_open(struct drm_device *drm, struct drm_file *filp)
if (!fpriv)
return -ENOMEM;
idr_init_base(&fpriv->contexts, 1); idr_init_base(&fpriv->legacy_contexts, 1);
xa_init_flags(&fpriv->contexts, XA_FLAGS_ALLOC1);
xa_init(&fpriv->syncpoints);
mutex_init(&fpriv->lock);
filp->driver_priv = fpriv;
@@ -107,20 +119,6 @@ static void tegra_drm_context_free(struct tegra_drm_context *context)
kfree(context);
}
static struct host1x_bo *
host1x_bo_lookup(struct drm_file *file, u32 handle)
{
struct drm_gem_object *gem;
struct tegra_bo *bo;
gem = drm_gem_object_lookup(file, handle);
if (!gem)
return NULL;
bo = to_tegra_bo(gem);
return &bo->base;
}
static int host1x_reloc_copy_from_user(struct host1x_reloc *dest,
struct drm_tegra_reloc __user *src,
struct drm_device *drm,
@@ -151,11 +149,11 @@ static int host1x_reloc_copy_from_user(struct host1x_reloc *dest,
dest->flags = HOST1X_RELOC_READ | HOST1X_RELOC_WRITE;
dest->cmdbuf.bo = host1x_bo_lookup(file, cmdbuf); dest->cmdbuf.bo = tegra_gem_lookup(file, cmdbuf);
if (!dest->cmdbuf.bo)
return -ENOENT;
dest->target.bo = host1x_bo_lookup(file, target); dest->target.bo = tegra_gem_lookup(file, target);
if (!dest->target.bo)
return -ENOENT;
@@ -193,7 +191,7 @@ int tegra_drm_submit(struct tegra_drm_context *context,
return -EINVAL;
job = host1x_job_alloc(context->channel, args->num_cmdbufs,
args->num_relocs); args->num_relocs, false);
if (!job)
return -ENOMEM;
@@ -201,6 +199,7 @@ int tegra_drm_submit(struct tegra_drm_context *context,
job->client = client;
job->class = client->class;
job->serialize = true;
job->syncpt_recovery = true;
/*
* Track referenced BOs so that they can be unreferenced after the
@@ -237,7 +236,7 @@ int tegra_drm_submit(struct tegra_drm_context *context,
goto fail;
}
bo = host1x_bo_lookup(file, cmdbuf.handle); bo = tegra_gem_lookup(file, cmdbuf.handle);
if (!bo) {
err = -ENOENT;
goto fail;
@@ -432,7 +431,7 @@ static int tegra_client_open(struct tegra_drm_file *fpriv,
if (err < 0)
return err;
err = idr_alloc(&fpriv->contexts, context, 1, 0, GFP_KERNEL); err = idr_alloc(&fpriv->legacy_contexts, context, 1, 0, GFP_KERNEL);
if (err < 0) {
client->ops->close_channel(context);
return err;
@@ -487,13 +486,13 @@ static int tegra_close_channel(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
context = idr_find(&fpriv->contexts, args->context); context = idr_find(&fpriv->legacy_contexts, args->context);
if (!context) {
err = -EINVAL;
goto unlock;
}
idr_remove(&fpriv->contexts, context->id); idr_remove(&fpriv->legacy_contexts, context->id);
tegra_drm_context_free(context);
unlock:
@@ -512,7 +511,7 @@ static int tegra_get_syncpt(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
context = idr_find(&fpriv->contexts, args->context); context = idr_find(&fpriv->legacy_contexts, args->context);
if (!context) {
err = -ENODEV;
goto unlock;
@@ -541,7 +540,7 @@ static int tegra_submit(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
context = idr_find(&fpriv->contexts, args->context); context = idr_find(&fpriv->legacy_contexts, args->context);
if (!context) {
err = -ENODEV;
goto unlock;
@@ -566,7 +565,7 @@ static int tegra_get_syncpt_base(struct drm_device *drm, void *data,
mutex_lock(&fpriv->lock);
context = idr_find(&fpriv->contexts, args->context); context = idr_find(&fpriv->legacy_contexts, args->context);
if (!context) {
err = -ENODEV;
goto unlock;
@@ -735,10 +734,25 @@ static int tegra_gem_get_flags(struct drm_device *drm, void *data,
static const struct drm_ioctl_desc tegra_drm_ioctls[] = {
#ifdef CONFIG_DRM_TEGRA_STAGING
DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_gem_create, DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_OPEN, tegra_drm_ioctl_channel_open,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_CLOSE, tegra_drm_ioctl_channel_close,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_MAP, tegra_drm_ioctl_channel_map,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_UNMAP, tegra_drm_ioctl_channel_unmap,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_SUBMIT, tegra_drm_ioctl_channel_submit,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_SYNCPOINT_ALLOCATE, tegra_drm_ioctl_syncpoint_allocate,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_GEM_MMAP, tegra_gem_mmap, DRM_IOCTL_DEF_DRV(TEGRA_SYNCPOINT_FREE, tegra_drm_ioctl_syncpoint_free,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_SYNCPOINT_WAIT, tegra_drm_ioctl_syncpoint_wait,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_gem_create, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_GEM_MMAP, tegra_gem_mmap, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_SYNCPT_READ, tegra_syncpt_read,
DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(TEGRA_SYNCPT_INCR, tegra_syncpt_incr,
@@ -792,10 +806,11 @@ static void tegra_drm_postclose(struct drm_device *drm, struct drm_file *file)
struct tegra_drm_file *fpriv = file->driver_priv;
mutex_lock(&fpriv->lock);
idr_for_each(&fpriv->contexts, tegra_drm_context_cleanup, NULL); idr_for_each(&fpriv->legacy_contexts, tegra_drm_context_cleanup, NULL);
tegra_drm_uapi_close_file(fpriv);
mutex_unlock(&fpriv->lock);
idr_destroy(&fpriv->contexts); idr_destroy(&fpriv->legacy_contexts);
mutex_destroy(&fpriv->lock);
kfree(fpriv);
}
@@ -853,7 +868,7 @@ static void tegra_debugfs_init(struct drm_minor *minor)
static const struct drm_driver tegra_drm_driver = {
.driver_features = DRIVER_MODESET | DRIVER_GEM |
DRIVER_ATOMIC | DRIVER_RENDER, DRIVER_ATOMIC | DRIVER_RENDER | DRIVER_SYNCOBJ,
.open = tegra_drm_open,
.postclose = tegra_drm_postclose,
.lastclose = drm_fb_helper_lastclose,
@@ -883,6 +898,14 @@ static const struct drm_driver tegra_drm_driver = {
int tegra_drm_register_client(struct tegra_drm *tegra,
struct tegra_drm_client *client)
{
/*
* When MLOCKs are implemented, change to allocate a shared channel
* only when MLOCKs are disabled.
*/
client->shared_channel = host1x_channel_request(&client->base);
if (!client->shared_channel)
return -EBUSY;
mutex_lock(&tegra->clients_lock);
list_add_tail(&client->list, &tegra->clients);
client->drm = tegra;
@@ -899,6 +922,9 @@ int tegra_drm_unregister_client(struct tegra_drm *tegra,
client->drm = NULL;
mutex_unlock(&tegra->clients_lock);
if (client->shared_channel)
host1x_channel_put(client->shared_channel);
return 0;
}
...
@@ -64,12 +64,22 @@ struct tegra_drm {
struct tegra_display_hub *hub;
};
static inline struct host1x *tegra_drm_to_host1x(struct tegra_drm *tegra)
{
return dev_get_drvdata(tegra->drm->dev->parent);
}
struct tegra_drm_client;
struct tegra_drm_context {
struct tegra_drm_client *client;
struct host1x_channel *channel;
/* Only used by legacy UAPI. */
unsigned int id;
/* Only used by new UAPI. */
struct xarray mappings;
};
struct tegra_drm_client_ops {
@@ -91,7 +101,9 @@ struct tegra_drm_client {
struct host1x_client base;
struct list_head list;
struct tegra_drm *drm;
struct host1x_channel *shared_channel;
/* Set by driver */
unsigned int version;
const struct tegra_drm_client_ops *ops;
};
...
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2010-2020 NVIDIA Corporation */
#include "drm.h"
#include "submit.h"
#include "uapi.h"
struct tegra_drm_firewall {
struct tegra_drm_submit_data *submit;
struct tegra_drm_client *client;
u32 *data;
u32 pos;
u32 end;
u32 class;
};
static int fw_next(struct tegra_drm_firewall *fw, u32 *word)
{
if (fw->pos == fw->end)
return -EINVAL;
*word = fw->data[fw->pos++];
return 0;
}
static bool fw_check_addr_valid(struct tegra_drm_firewall *fw, u32 offset)
{
u32 i;
for (i = 0; i < fw->submit->num_used_mappings; i++) {
struct tegra_drm_mapping *m = fw->submit->used_mappings[i].mapping;
if (offset >= m->iova && offset <= m->iova_end)
return true;
}
return false;
}
static int fw_check_reg(struct tegra_drm_firewall *fw, u32 offset)
{
bool is_addr;
u32 word;
int err;
err = fw_next(fw, &word);
if (err)
return err;
if (!fw->client->ops->is_addr_reg)
return 0;
is_addr = fw->client->ops->is_addr_reg(fw->client->base.dev, fw->class,
offset);
if (!is_addr)
return 0;
if (!fw_check_addr_valid(fw, word))
return -EINVAL;
return 0;
}
static int fw_check_regs_seq(struct tegra_drm_firewall *fw, u32 offset,
u32 count, bool incr)
{
u32 i;
for (i = 0; i < count; i++) {
if (fw_check_reg(fw, offset))
return -EINVAL;
if (incr)
offset++;
}
return 0;
}
static int fw_check_regs_mask(struct tegra_drm_firewall *fw, u32 offset,
u16 mask)
{
unsigned long bmask = mask;
unsigned int bit;
for_each_set_bit(bit, &bmask, 16) {
if (fw_check_reg(fw, offset+bit))
return -EINVAL;
}
return 0;
}
static int fw_check_regs_imm(struct tegra_drm_firewall *fw, u32 offset)
{
bool is_addr;
is_addr = fw->client->ops->is_addr_reg(fw->client->base.dev, fw->class,
offset);
if (is_addr)
return -EINVAL;
return 0;
}
static int fw_check_class(struct tegra_drm_firewall *fw, u32 class)
{
if (!fw->client->ops->is_valid_class) {
if (class == fw->client->base.class)
return 0;
else
return -EINVAL;
}
if (!fw->client->ops->is_valid_class(class))
return -EINVAL;
return 0;
}
enum {
HOST1X_OPCODE_SETCLASS = 0x00,
HOST1X_OPCODE_INCR = 0x01,
HOST1X_OPCODE_NONINCR = 0x02,
HOST1X_OPCODE_MASK = 0x03,
HOST1X_OPCODE_IMM = 0x04,
HOST1X_OPCODE_RESTART = 0x05,
HOST1X_OPCODE_GATHER = 0x06,
HOST1X_OPCODE_SETSTRMID = 0x07,
HOST1X_OPCODE_SETAPPID = 0x08,
HOST1X_OPCODE_SETPYLD = 0x09,
HOST1X_OPCODE_INCR_W = 0x0a,
HOST1X_OPCODE_NONINCR_W = 0x0b,
HOST1X_OPCODE_GATHER_W = 0x0c,
HOST1X_OPCODE_RESTART_W = 0x0d,
HOST1X_OPCODE_EXTEND = 0x0e,
};
int tegra_drm_fw_validate(struct tegra_drm_client *client, u32 *data, u32 start,
u32 words, struct tegra_drm_submit_data *submit,
u32 *job_class)
{
struct tegra_drm_firewall fw = {
.submit = submit,
.client = client,
.data = data,
.pos = start,
.end = start+words,
.class = *job_class,
};
bool payload_valid = false;
u32 payload;
int err;
while (fw.pos != fw.end) {
u32 word, opcode, offset, count, mask, class;
err = fw_next(&fw, &word);
if (err)
return err;
opcode = (word & 0xf0000000) >> 28;
switch (opcode) {
case HOST1X_OPCODE_SETCLASS:
offset = word >> 16 & 0xfff;
mask = word & 0x3f;
class = (word >> 6) & 0x3ff;
err = fw_check_class(&fw, class);
fw.class = class;
*job_class = class;
if (!err)
err = fw_check_regs_mask(&fw, offset, mask);
if (err)
dev_warn(client->base.dev,
"illegal SETCLASS(offset=0x%x, mask=0x%x, class=0x%x) at word %u",
offset, mask, class, fw.pos-1);
break;
case HOST1X_OPCODE_INCR:
offset = (word >> 16) & 0xfff;
count = word & 0xffff;
err = fw_check_regs_seq(&fw, offset, count, true);
if (err)
dev_warn(client->base.dev,
"illegal INCR(offset=0x%x, count=%u) in class 0x%x at word %u",
offset, count, fw.class, fw.pos-1);
break;
case HOST1X_OPCODE_NONINCR:
offset = (word >> 16) & 0xfff;
count = word & 0xffff;
err = fw_check_regs_seq(&fw, offset, count, false);
if (err)
dev_warn(client->base.dev,
"illegal NONINCR(offset=0x%x, count=%u) in class 0x%x at word %u",
offset, count, fw.class, fw.pos-1);
break;
case HOST1X_OPCODE_MASK:
offset = (word >> 16) & 0xfff;
mask = word & 0xffff;
err = fw_check_regs_mask(&fw, offset, mask);
if (err)
dev_warn(client->base.dev,
"illegal MASK(offset=0x%x, mask=0x%x) in class 0x%x at word %u",
offset, mask, fw.class, fw.pos-1);
break;
case HOST1X_OPCODE_IMM:
/* IMM cannot reasonably be used to write a pointer */
offset = (word >> 16) & 0xfff;
err = fw_check_regs_imm(&fw, offset);
if (err)
dev_warn(client->base.dev,
"illegal IMM(offset=0x%x) in class 0x%x at word %u",
offset, fw.class, fw.pos-1);
break;
case HOST1X_OPCODE_SETPYLD:
payload = word & 0xffff;
payload_valid = true;
break;
case HOST1X_OPCODE_INCR_W:
if (!payload_valid)
return -EINVAL;
offset = word & 0x3fffff;
err = fw_check_regs_seq(&fw, offset, payload, true);
if (err)
dev_warn(client->base.dev,
"illegal INCR_W(offset=0x%x) in class 0x%x at word %u",
offset, fw.class, fw.pos-1);
break;
case HOST1X_OPCODE_NONINCR_W:
if (!payload_valid)
return -EINVAL;
offset = word & 0x3fffff;
err = fw_check_regs_seq(&fw, offset, payload, false);
if (err)
dev_warn(client->base.dev,
"illegal NONINCR(offset=0x%x) in class 0x%x at word %u",
offset, fw.class, fw.pos-1);
break;
default:
dev_warn(client->base.dev, "illegal opcode at word %u",
fw.pos-1);
return -EINVAL;
}
if (err)
return err;
}
return 0;
}
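/*
 * Example decode of two host1x command words under the opcode layout parsed
 * above (offsets and classes are hypothetical, shown only to illustrate the
 * bit fields):
 *
 *   word = 0x104c0002
 *     opcode = word >> 28           = 0x1   -> HOST1X_OPCODE_INCR
 *     offset = (word >> 16) & 0xfff = 0x04c
 *     count  = word & 0xffff        = 2
 *   The firewall then consumes the next two data words and, for any offset
 *   that ops->is_addr_reg() reports as an address register, checks that the
 *   written value falls inside one of the job's mappings.
 *
 *   word = 0x00001740
 *     opcode = word >> 28           = 0x0   -> HOST1X_OPCODE_SETCLASS
 *     offset = (word >> 16) & 0xfff = 0x000
 *     class  = (word >> 6) & 0x3ff  = 0x05d (e.g. VIC)
 *     mask   = word & 0x3f          = 0x00
 *   fw_check_class() validates the class switch; with an empty mask no
 *   register writes follow the SETCLASS word itself.
 */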
@@ -707,3 +707,16 @@ struct drm_gem_object *tegra_gem_prime_import(struct drm_device *drm,
return &bo->gem;
}
struct host1x_bo *tegra_gem_lookup(struct drm_file *file, u32 handle)
{
struct drm_gem_object *gem;
struct tegra_bo *bo;
gem = drm_gem_object_lookup(file, handle);
if (!gem)
return NULL;
bo = to_tegra_bo(gem);
return &bo->base;
}
@@ -80,4 +80,6 @@ struct dma_buf *tegra_gem_prime_export(struct drm_gem_object *gem,
struct drm_gem_object *tegra_gem_prime_import(struct drm_device *drm,
struct dma_buf *buf);
struct host1x_bo *tegra_gem_lookup(struct drm_file *file, u32 handle);
#endif
@@ -4,6 +4,7 @@
*/
#include <linux/iommu.h>
#include <linux/interconnect.h>
#include <drm/drm_atomic.h>
#include <drm/drm_atomic_helper.h>
@@ -64,6 +65,9 @@ tegra_plane_atomic_duplicate_state(struct drm_plane *plane)
copy->reflect_x = state->reflect_x;
copy->reflect_y = state->reflect_y;
copy->opaque = state->opaque;
copy->total_peak_memory_bandwidth = state->total_peak_memory_bandwidth;
copy->peak_memory_bandwidth = state->peak_memory_bandwidth;
copy->avg_memory_bandwidth = state->avg_memory_bandwidth;
for (i = 0; i < 2; i++)
copy->blending[i] = state->blending[i];
@@ -244,6 +248,78 @@ void tegra_plane_cleanup_fb(struct drm_plane *plane,
tegra_dc_unpin(dc, to_tegra_plane_state(state));
}
static int tegra_plane_calculate_memory_bandwidth(struct drm_plane_state *state)
{
struct tegra_plane_state *tegra_state = to_tegra_plane_state(state);
unsigned int i, bpp, dst_w, dst_h, src_w, src_h, mul;
const struct tegra_dc_soc_info *soc;
const struct drm_format_info *fmt;
struct drm_crtc_state *crtc_state;
u64 avg_bandwidth, peak_bandwidth;
if (!state->visible)
return 0;
crtc_state = drm_atomic_get_new_crtc_state(state->state, state->crtc);
if (!crtc_state)
return -EINVAL;
src_w = drm_rect_width(&state->src) >> 16;
src_h = drm_rect_height(&state->src) >> 16;
dst_w = drm_rect_width(&state->dst);
dst_h = drm_rect_height(&state->dst);
fmt = state->fb->format;
soc = to_tegra_dc(state->crtc)->soc;
/*
* Note that real memory bandwidth varies depending on format and
* memory layout; we are not taking that into account because a small
* estimation error isn't important since bandwidth is rounded up
* anyway.
*/
for (i = 0, bpp = 0; i < fmt->num_planes; i++) {
unsigned int bpp_plane = fmt->cpp[i] * 8;
/*
* Sub-sampling is relevant for chroma planes only and vertical
* readouts are not cached, hence only horizontal sub-sampling
* matters.
*/
if (i > 0)
bpp_plane /= fmt->hsub;
bpp += bpp_plane;
}
/* average bandwidth in kbytes/sec */
avg_bandwidth = min(src_w, dst_w) * min(src_h, dst_h);
avg_bandwidth *= drm_mode_vrefresh(&crtc_state->adjusted_mode);
avg_bandwidth = DIV_ROUND_UP(avg_bandwidth * bpp, 8) + 999;
do_div(avg_bandwidth, 1000);
/* mode.clock in kHz, peak bandwidth in kbytes/sec */
peak_bandwidth = DIV_ROUND_UP(crtc_state->adjusted_mode.clock * bpp, 8);
/*
* Tegra30/114 Memory Controller can't interleave DC memory requests
* for the tiled windows because DC uses 16-bytes atom, while DDR3
* uses 32-bytes atom. Hence there is x2 memory overfetch for tiled
* framebuffer and DDR3 on these SoCs.
*/
if (soc->plane_tiled_memory_bandwidth_x2 &&
tegra_state->tiling.mode == TEGRA_BO_TILING_MODE_TILED)
mul = 2;
else
mul = 1;
/* ICC bandwidth in kbytes/sec */
tegra_state->peak_memory_bandwidth = kBps_to_icc(peak_bandwidth) * mul;
tegra_state->avg_memory_bandwidth = kBps_to_icc(avg_bandwidth) * mul;
return 0;
}
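/*
 * Worked example of the formulas above (illustrative mode and plane): a
 * 1920x1080 XRGB8888 plane (bpp = 32, no scaling, no sub-sampling) on a
 * 60 Hz mode with a 148500 kHz pixel clock and a linear buffer:
 *
 *   avg_bandwidth  = 1920 * 1080 * 60 * 32 / 8 = 497664000 bytes/sec
 *                  -> 497664 kbytes/sec after rounding to kbytes
 *
 *   peak_bandwidth = DIV_ROUND_UP(148500 * 32, 8) = 594000 kbytes/sec
 *
 * With a tiled buffer on Tegra30/114 (plane_tiled_memory_bandwidth_x2),
 * mul = 2 and both ICC values are doubled.
 */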
int tegra_plane_state_add(struct tegra_plane *plane,
struct drm_plane_state *state)
{
@@ -262,6 +338,10 @@ int tegra_plane_state_add(struct tegra_plane *plane,
if (err < 0)
return err;
err = tegra_plane_calculate_memory_bandwidth(state);
if (err < 0)
return err;
tegra = to_dc_state(crtc_state);
tegra->planes |= WIN_A_ACT_REQ << plane->index;
@@ -646,3 +726,40 @@ int tegra_plane_setup_legacy_state(struct tegra_plane *tegra,
return 0;
}
static const char * const tegra_plane_icc_names[TEGRA_DC_LEGACY_PLANES_NUM] = {
"wina", "winb", "winc", NULL, NULL, NULL, "cursor",
};
int tegra_plane_interconnect_init(struct tegra_plane *plane)
{
const char *icc_name = tegra_plane_icc_names[plane->index];
struct device *dev = plane->dc->dev;
struct tegra_dc *dc = plane->dc;
int err;
if (WARN_ON(plane->index >= TEGRA_DC_LEGACY_PLANES_NUM) ||
WARN_ON(!tegra_plane_icc_names[plane->index]))
return -EINVAL;
plane->icc_mem = devm_of_icc_get(dev, icc_name);
err = PTR_ERR_OR_ZERO(plane->icc_mem);
if (err) {
dev_err_probe(dev, err, "failed to get %s interconnect\n",
icc_name);
return err;
}
/* plane B on T20/30 has a dedicated memory client for a 6-tap vertical filter */
if (plane->index == 1 && dc->soc->has_win_b_vfilter_mem_client) {
plane->icc_mem_vfilter = devm_of_icc_get(dev, "winb-vfilter");
err = PTR_ERR_OR_ZERO(plane->icc_mem_vfilter);
if (err) {
dev_err_probe(dev, err, "failed to get %s interconnect\n",
"winb-vfilter");
return err;
}
}
return 0;
}
@@ -8,6 +8,7 @@
#include <drm/drm_plane.h>
struct icc_path;
struct tegra_bo;
struct tegra_dc;
@@ -16,6 +17,9 @@ struct tegra_plane {
struct tegra_dc *dc;
unsigned int offset;
unsigned int index;
struct icc_path *icc_mem;
struct icc_path *icc_mem_vfilter;
};
struct tegra_cursor {
@@ -52,6 +56,11 @@ struct tegra_plane_state {
/* used for legacy blending support only */
struct tegra_plane_legacy_blending_state blending[2];
bool opaque;
/* bandwidths are in ICC units, i.e. kbytes/sec */
u32 total_peak_memory_bandwidth;
u32 peak_memory_bandwidth;
u32 avg_memory_bandwidth;
};
static inline struct tegra_plane_state *
@@ -63,6 +72,12 @@ to_tegra_plane_state(struct drm_plane_state *state)
return NULL;
}
static inline const struct tegra_plane_state *
to_const_tegra_plane_state(const struct drm_plane_state *state)
{
return to_tegra_plane_state((struct drm_plane_state *)state);
}
extern const struct drm_plane_funcs tegra_plane_funcs;
int tegra_plane_prepare_fb(struct drm_plane *plane,
@@ -78,5 +93,6 @@ bool tegra_plane_format_is_indexed(unsigned int format);
bool tegra_plane_format_is_yuv(unsigned int format, bool *planar, unsigned int *bpc);
int tegra_plane_setup_legacy_state(struct tegra_plane *tegra,
struct tegra_plane_state *state);
int tegra_plane_interconnect_init(struct tegra_plane *plane);
#endif /* TEGRA_PLANE_H */
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 NVIDIA Corporation */
#include <linux/dma-fence-array.h>
#include <linux/dma-mapping.h>
#include <linux/file.h>
#include <linux/host1x.h>
#include <linux/iommu.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/nospec.h>
#include <linux/pm_runtime.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
#include <linux/sync_file.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
#include <drm/drm_syncobj.h>
#include "drm.h"
#include "gem.h"
#include "submit.h"
#include "uapi.h"
#define SUBMIT_ERR(context, fmt, ...) \
dev_err_ratelimited(context->client->base.dev, \
"%s: job submission failed: " fmt "\n", \
current->comm, ##__VA_ARGS__)
struct gather_bo {
struct host1x_bo base;
struct kref ref;
struct device *dev;
u32 *gather_data;
dma_addr_t gather_data_dma;
size_t gather_data_words;
};
static struct host1x_bo *gather_bo_get(struct host1x_bo *host_bo)
{
struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
kref_get(&bo->ref);
return host_bo;
}
static void gather_bo_release(struct kref *ref)
{
struct gather_bo *bo = container_of(ref, struct gather_bo, ref);
dma_free_attrs(bo->dev, bo->gather_data_words * 4, bo->gather_data, bo->gather_data_dma,
0);
kfree(bo);
}
static void gather_bo_put(struct host1x_bo *host_bo)
{
struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
kref_put(&bo->ref, gather_bo_release);
}
static struct sg_table *
gather_bo_pin(struct device *dev, struct host1x_bo *host_bo, dma_addr_t *phys)
{
struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
struct sg_table *sgt;
int err;
sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
if (!sgt)
return ERR_PTR(-ENOMEM);
err = dma_get_sgtable(bo->dev, sgt, bo->gather_data, bo->gather_data_dma,
bo->gather_data_words * 4);
if (err) {
kfree(sgt);
return ERR_PTR(err);
}
return sgt;
}
static void gather_bo_unpin(struct device *dev, struct sg_table *sgt)
{
if (sgt) {
sg_free_table(sgt);
kfree(sgt);
}
}
static void *gather_bo_mmap(struct host1x_bo *host_bo)
{
struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
return bo->gather_data;
}
static void gather_bo_munmap(struct host1x_bo *host_bo, void *addr)
{
}
const struct host1x_bo_ops gather_bo_ops = {
.get = gather_bo_get,
.put = gather_bo_put,
.pin = gather_bo_pin,
.unpin = gather_bo_unpin,
.mmap = gather_bo_mmap,
.munmap = gather_bo_munmap,
};
static struct tegra_drm_mapping *
tegra_drm_mapping_get(struct tegra_drm_context *context, u32 id)
{
struct tegra_drm_mapping *mapping;
xa_lock(&context->mappings);
mapping = xa_load(&context->mappings, id);
if (mapping)
kref_get(&mapping->ref);
xa_unlock(&context->mappings);
return mapping;
}
static void *alloc_copy_user_array(void __user *from, size_t count, size_t size)
{
size_t copy_len;
void *data;
if (check_mul_overflow(count, size, &copy_len))
return ERR_PTR(-EINVAL);
if (copy_len > 0x4000)
return ERR_PTR(-E2BIG);
data = kvmalloc(copy_len, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
if (copy_from_user(data, from, copy_len)) {
kvfree(data);
return ERR_PTR(-EFAULT);
}
return data;
}
static int submit_copy_gather_data(struct gather_bo **pbo, struct device *dev,
struct tegra_drm_context *context,
struct drm_tegra_channel_submit *args)
{
struct gather_bo *bo;
size_t copy_len;
if (args->gather_data_words == 0) {
SUBMIT_ERR(context, "gather_data_words cannot be zero");
return -EINVAL;
}
if (check_mul_overflow((size_t)args->gather_data_words, (size_t)4, &copy_len)) {
SUBMIT_ERR(context, "gather_data_words is too large");
return -EINVAL;
}
bo = kzalloc(sizeof(*bo), GFP_KERNEL);
if (!bo) {
SUBMIT_ERR(context, "failed to allocate memory for bo info");
return -ENOMEM;
}
host1x_bo_init(&bo->base, &gather_bo_ops);
kref_init(&bo->ref);
bo->dev = dev;
bo->gather_data = dma_alloc_attrs(dev, copy_len, &bo->gather_data_dma,
GFP_KERNEL | __GFP_NOWARN, 0);
if (!bo->gather_data) {
SUBMIT_ERR(context, "failed to allocate memory for gather data");
kfree(bo);
return -ENOMEM;
}
if (copy_from_user(bo->gather_data, u64_to_user_ptr(args->gather_data_ptr), copy_len)) {
SUBMIT_ERR(context, "failed to copy gather data from userspace");
dma_free_attrs(dev, copy_len, bo->gather_data, bo->gather_data_dma, 0);
kfree(bo);
return -EFAULT;
}
bo->gather_data_words = args->gather_data_words;
*pbo = bo;
return 0;
}
static int submit_write_reloc(struct tegra_drm_context *context, struct gather_bo *bo,
struct drm_tegra_submit_buf *buf, struct tegra_drm_mapping *mapping)
{
/* TODO check that target_offset is within bounds */
dma_addr_t iova = mapping->iova + buf->reloc.target_offset;
u32 written_ptr;
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
if (buf->flags & DRM_TEGRA_SUBMIT_RELOC_SECTOR_LAYOUT)
iova |= BIT_ULL(39);
#endif
written_ptr = iova >> buf->reloc.shift;
if (buf->reloc.gather_offset_words >= bo->gather_data_words) {
SUBMIT_ERR(context,
"relocation has too large gather offset (%u vs gather length %zu)",
buf->reloc.gather_offset_words, bo->gather_data_words);
return -EINVAL;
}
buf->reloc.gather_offset_words = array_index_nospec(buf->reloc.gather_offset_words,
bo->gather_data_words);
bo->gather_data[buf->reloc.gather_offset_words] = written_ptr;
return 0;
}
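/* Resolve the mapping for each buffer, patch its relocation into the gather and record the mappings held by the job */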
static int submit_process_bufs(struct tegra_drm_context *context, struct gather_bo *bo,
struct drm_tegra_channel_submit *args,
struct tegra_drm_submit_data *job_data)
{
struct tegra_drm_used_mapping *mappings;
struct drm_tegra_submit_buf *bufs;
int err;
u32 i;
bufs = alloc_copy_user_array(u64_to_user_ptr(args->bufs_ptr), args->num_bufs,
sizeof(*bufs));
if (IS_ERR(bufs)) {
SUBMIT_ERR(context, "failed to copy bufs array from userspace");
return PTR_ERR(bufs);
}
mappings = kcalloc(args->num_bufs, sizeof(*mappings), GFP_KERNEL);
if (!mappings) {
SUBMIT_ERR(context, "failed to allocate memory for mapping info");
err = -ENOMEM;
goto done;
}
for (i = 0; i < args->num_bufs; i++) {
struct drm_tegra_submit_buf *buf = &bufs[i];
struct tegra_drm_mapping *mapping;
if (buf->flags & ~DRM_TEGRA_SUBMIT_RELOC_SECTOR_LAYOUT) {
SUBMIT_ERR(context, "invalid flag specified for buffer");
err = -EINVAL;
goto drop_refs;
}
mapping = tegra_drm_mapping_get(context, buf->mapping);
if (!mapping) {
SUBMIT_ERR(context, "invalid mapping ID '%u' for buffer", buf->mapping);
err = -EINVAL;
goto drop_refs;
}
err = submit_write_reloc(context, bo, buf, mapping);
if (err) {
tegra_drm_mapping_put(mapping);
goto drop_refs;
}
mappings[i].mapping = mapping;
mappings[i].flags = buf->flags;
}
job_data->used_mappings = mappings;
job_data->num_used_mappings = i;
err = 0;
goto done;
drop_refs:
while (i--)
tegra_drm_mapping_put(mappings[i].mapping);
kfree(mappings);
job_data->used_mappings = NULL;
done:
kvfree(bufs);
return err;
}
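/* Attach the user-allocated syncpoint and the number of increments to the job */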
static int submit_get_syncpt(struct tegra_drm_context *context, struct host1x_job *job,
struct xarray *syncpoints, struct drm_tegra_channel_submit *args)
{
struct host1x_syncpt *sp;
if (args->syncpt.flags) {
SUBMIT_ERR(context, "invalid flag specified for syncpt");
return -EINVAL;
}
/* Syncpt ref will be dropped on job release */
sp = xa_load(syncpoints, args->syncpt.id);
if (!sp) {
SUBMIT_ERR(context, "syncpoint specified in syncpt was not allocated");
return -EINVAL;
}
job->syncpt = host1x_syncpt_get(sp);
job->syncpt_incrs = args->syncpt.increments;
return 0;
}
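/* Validate a GATHER_UPTR command and append the gather to the host1x job */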
static int submit_job_add_gather(struct host1x_job *job, struct tegra_drm_context *context,
struct drm_tegra_submit_cmd_gather_uptr *cmd,
struct gather_bo *bo, u32 *offset,
struct tegra_drm_submit_data *job_data,
u32 *class)
{
u32 next_offset;
if (cmd->reserved[0] || cmd->reserved[1] || cmd->reserved[2]) {
SUBMIT_ERR(context, "non-zero reserved field in GATHER_UPTR command");
return -EINVAL;
}
/* Check for maximum gather size */
if (cmd->words > 16383) {
SUBMIT_ERR(context, "too many words in GATHER_UPTR command");
return -EINVAL;
}
if (check_add_overflow(*offset, cmd->words, &next_offset)) {
SUBMIT_ERR(context, "too many total words in job");
return -EINVAL;
}
if (next_offset > bo->gather_data_words) {
SUBMIT_ERR(context, "GATHER_UPTR command overflows gather data");
return -EINVAL;
}
if (tegra_drm_fw_validate(context->client, bo->gather_data, *offset,
cmd->words, job_data, class)) {
SUBMIT_ERR(context, "job was rejected by firewall");
return -EINVAL;
}
host1x_job_add_gather(job, &bo->base, cmd->words, *offset * 4);
*offset = next_offset;
return 0;
}
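/* Build a host1x job from the userspace command list, turning GATHER_UPTR and WAIT_SYNCPT commands into gathers and waits */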
static struct host1x_job *
submit_create_job(struct tegra_drm_context *context, struct gather_bo *bo,
struct drm_tegra_channel_submit *args, struct tegra_drm_submit_data *job_data,
struct xarray *syncpoints)
{
struct drm_tegra_submit_cmd *cmds;
u32 i, gather_offset = 0, class;
struct host1x_job *job;
int err;
/* Set initial class for firewall. */
class = context->client->base.class;
cmds = alloc_copy_user_array(u64_to_user_ptr(args->cmds_ptr), args->num_cmds,
sizeof(*cmds));
if (IS_ERR(cmds)) {
SUBMIT_ERR(context, "failed to copy cmds array from userspace");
return ERR_CAST(cmds);
}
job = host1x_job_alloc(context->channel, args->num_cmds, 0, true);
if (!job) {
SUBMIT_ERR(context, "failed to allocate memory for job");
job = ERR_PTR(-ENOMEM);
goto done;
}
err = submit_get_syncpt(context, job, syncpoints, args);
if (err < 0)
goto free_job;
job->client = &context->client->base;
job->class = context->client->base.class;
job->serialize = true;
for (i = 0; i < args->num_cmds; i++) {
struct drm_tegra_submit_cmd *cmd = &cmds[i];
if (cmd->flags) {
SUBMIT_ERR(context, "unknown flags given for cmd");
err = -EINVAL;
goto free_job;
}
if (cmd->type == DRM_TEGRA_SUBMIT_CMD_GATHER_UPTR) {
err = submit_job_add_gather(job, context, &cmd->gather_uptr, bo,
&gather_offset, job_data, &class);
if (err)
goto free_job;
} else if (cmd->type == DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT) {
if (cmd->wait_syncpt.reserved[0] || cmd->wait_syncpt.reserved[1]) {
SUBMIT_ERR(context, "non-zero reserved value");
err = -EINVAL;
goto free_job;
}
host1x_job_add_wait(job, cmd->wait_syncpt.id, cmd->wait_syncpt.value,
false, class);
} else if (cmd->type == DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT_RELATIVE) {
if (cmd->wait_syncpt.reserved[0] || cmd->wait_syncpt.reserved[1]) {
SUBMIT_ERR(context, "non-zero reserved value");
err = -EINVAL;
goto free_job;
}
if (cmd->wait_syncpt.id != args->syncpt.id) {
SUBMIT_ERR(context, "syncpoint ID in CMD_WAIT_SYNCPT_RELATIVE is not used by the job");
err = -EINVAL;
goto free_job;
}
host1x_job_add_wait(job, cmd->wait_syncpt.id, cmd->wait_syncpt.value,
true, class);
} else {
SUBMIT_ERR(context, "unknown cmd type");
err = -EINVAL;
goto free_job;
}
}
if (gather_offset == 0) {
SUBMIT_ERR(context, "job must have at least one gather");
err = -EINVAL;
goto free_job;
}
goto done;
free_job:
host1x_job_put(job);
job = ERR_PTR(err);
done:
kvfree(cmds);
return job;
}
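/* Job release callback: drop the mapping references and the engine's runtime PM reference taken at submit time */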
static void release_job(struct host1x_job *job)
{
struct tegra_drm_client *client = container_of(job->client, struct tegra_drm_client, base);
struct tegra_drm_submit_data *job_data = job->user_data;
u32 i;
for (i = 0; i < job_data->num_used_mappings; i++)
tegra_drm_mapping_put(job_data->used_mappings[i].mapping);
kfree(job_data->used_mappings);
kfree(job_data);
if (pm_runtime_enabled(client->base.dev))
pm_runtime_put_autosuspend(client->base.dev);
}
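/* Main submit ioctl: copy in the command stream, resolve buffers, then build and submit the host1x job */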
int tegra_drm_ioctl_channel_submit(struct drm_device *drm, void *data,
struct drm_file *file)
{
struct tegra_drm_file *fpriv = file->driver_priv;
struct drm_tegra_channel_submit *args = data;
struct tegra_drm_submit_data *job_data;
struct drm_syncobj *syncobj = NULL;
struct tegra_drm_context *context;
struct host1x_job *job;
struct gather_bo *bo;
u32 i;
int err;
mutex_lock(&fpriv->lock);
context = xa_load(&fpriv->contexts, args->context);
if (!context) {
mutex_unlock(&fpriv->lock);
pr_err_ratelimited("%s: %s: invalid channel context '%#x'\n", __func__,
current->comm, args->context);
return -EINVAL;
}
if (args->syncobj_in) {
struct dma_fence *fence;
err = drm_syncobj_find_fence(file, args->syncobj_in, 0, 0, &fence);
if (err) {
SUBMIT_ERR(context, "invalid syncobj_in '%#x'", args->syncobj_in);
goto unlock;
}
err = dma_fence_wait_timeout(fence, true, msecs_to_jiffies(10000));
dma_fence_put(fence);
/* dma_fence_wait_timeout() returns remaining jiffies on success, 0 on timeout */
if (err <= 0) {
if (err == 0)
err = -ETIMEDOUT;
SUBMIT_ERR(context, "wait for syncobj_in timed out");
goto unlock;
}
}
if (args->syncobj_out) {
syncobj = drm_syncobj_find(file, args->syncobj_out);
if (!syncobj) {
SUBMIT_ERR(context, "invalid syncobj_out '%#x'", args->syncobj_out);
err = -ENOENT;
goto unlock;
}
}
/* Allocate gather BO and copy gather words in. */
err = submit_copy_gather_data(&bo, drm->dev, context, args);
if (err)
goto unlock;
job_data = kzalloc(sizeof(*job_data), GFP_KERNEL);
if (!job_data) {
SUBMIT_ERR(context, "failed to allocate memory for job data");
err = -ENOMEM;
goto put_bo;
}
/* Get data buffer mappings and do relocation patching. */
err = submit_process_bufs(context, bo, args, job_data);
if (err)
goto free_job_data;
/* Allocate host1x_job and add gathers and waits to it. */
job = submit_create_job(context, bo, args, job_data, &fpriv->syncpoints);
if (IS_ERR(job)) {
err = PTR_ERR(job);
goto free_job_data;
}
/* Map gather data for Host1x. */
err = host1x_job_pin(job, context->client->base.dev);
if (err) {
SUBMIT_ERR(context, "failed to pin job: %d", err);
goto put_job;
}
/* Boot engine. */
if (pm_runtime_enabled(context->client->base.dev)) {
err = pm_runtime_resume_and_get(context->client->base.dev);
if (err < 0) {
SUBMIT_ERR(context, "could not power up engine: %d", err);
goto unpin_job;
}
}
job->user_data = job_data;
job->release = release_job;
job->timeout = 10000;
/*
* job_data is now part of job reference counting, so don't release
* it from here.
*/
job_data = NULL;
/* Submit job to hardware. */
err = host1x_job_submit(job);
if (err) {
SUBMIT_ERR(context, "host1x job submission failed: %d", err);
goto unpin_job;
}
/* Return postfences to userspace and add fences to DMA reservations. */
args->syncpt.value = job->syncpt_end;
if (syncobj) {
struct dma_fence *fence = host1x_fence_create(job->syncpt, job->syncpt_end);
if (IS_ERR(fence)) {
err = PTR_ERR(fence);
SUBMIT_ERR(context, "failed to create postfence: %d", err);
/* don't hand an error pointer to the syncobj */
goto put_job;
}
drm_syncobj_replace_fence(syncobj, fence);
}
goto put_job;
unpin_job:
host1x_job_unpin(job);
put_job:
host1x_job_put(job);
free_job_data:
if (job_data && job_data->used_mappings) {
for (i = 0; i < job_data->num_used_mappings; i++)
tegra_drm_mapping_put(job_data->used_mappings[i].mapping);
kfree(job_data->used_mappings);
}
kfree(job_data);
put_bo:
gather_bo_put(&bo->base);
unlock:
if (syncobj)
drm_syncobj_put(syncobj);
mutex_unlock(&fpriv->lock);
return err;
}
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2020 NVIDIA Corporation */
#ifndef _TEGRA_DRM_UAPI_SUBMIT_H
#define _TEGRA_DRM_UAPI_SUBMIT_H
struct tegra_drm_used_mapping {
struct tegra_drm_mapping *mapping;
u32 flags;
};
struct tegra_drm_submit_data {
struct tegra_drm_used_mapping *used_mappings;
u32 num_used_mappings;
};
int tegra_drm_fw_validate(struct tegra_drm_client *client, u32 *data, u32 start,
u32 words, struct tegra_drm_submit_data *submit,
u32 *job_class);
#endif
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2020 NVIDIA Corporation */
#include <linux/host1x.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <drm/drm_drv.h>
#include <drm/drm_file.h>
#include <drm/drm_utils.h>
#include "drm.h"
#include "uapi.h"
static void tegra_drm_mapping_release(struct kref *ref)
{
struct tegra_drm_mapping *mapping =
container_of(ref, struct tegra_drm_mapping, ref);
if (mapping->sgt)
dma_unmap_sgtable(mapping->dev, mapping->sgt, mapping->direction,
DMA_ATTR_SKIP_CPU_SYNC);
host1x_bo_unpin(mapping->dev, mapping->bo, mapping->sgt);
host1x_bo_put(mapping->bo);
kfree(mapping);
}
void tegra_drm_mapping_put(struct tegra_drm_mapping *mapping)
{
kref_put(&mapping->ref, tegra_drm_mapping_release);
}
static void tegra_drm_channel_context_close(struct tegra_drm_context *context)
{
struct tegra_drm_mapping *mapping;
unsigned long id;
xa_for_each(&context->mappings, id, mapping)
tegra_drm_mapping_put(mapping);
xa_destroy(&context->mappings);
host1x_channel_put(context->channel);
kfree(context);
}
void tegra_drm_uapi_close_file(struct tegra_drm_file *file)
{
struct tegra_drm_context *context;
struct host1x_syncpt *sp;
unsigned long id;
xa_for_each(&file->contexts, id, context)
tegra_drm_channel_context_close(context);
xa_for_each(&file->syncpoints, id, sp)
host1x_syncpt_put(sp);
xa_destroy(&file->contexts);
xa_destroy(&file->syncpoints);
}
static struct tegra_drm_client *tegra_drm_find_client(struct tegra_drm *tegra, u32 class)
{
struct tegra_drm_client *client;
list_for_each_entry(client, &tegra->clients, list)
if (client->base.class == class)
return client;
return NULL;
}
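/* Open a channel context on the engine implementing the requested host1x class and report its version and capabilities */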
int tegra_drm_ioctl_channel_open(struct drm_device *drm, void *data, struct drm_file *file)
{
struct tegra_drm_file *fpriv = file->driver_priv;
struct tegra_drm *tegra = drm->dev_private;
struct drm_tegra_channel_open *args = data;
struct tegra_drm_client *client = NULL;
struct tegra_drm_context *context;
int err;
if (args->flags)
return -EINVAL;
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return -ENOMEM;
client = tegra_drm_find_client(tegra, args->host1x_class);
if (!client) {
err = -ENODEV;
goto free;
}
if (client->shared_channel) {
context->channel = host1x_channel_get(client->shared_channel);
} else {
context->channel = host1x_channel_request(&client->base);
if (!context->channel) {
err = -EBUSY;
goto free;
}
}
err = xa_alloc(&fpriv->contexts, &args->context, context, XA_LIMIT(1, U32_MAX),
GFP_KERNEL);
if (err < 0)
goto put_channel;
context->client = client;
xa_init_flags(&context->mappings, XA_FLAGS_ALLOC1);
args->version = client->version;
args->capabilities = 0;
if (device_get_dma_attr(client->base.dev) == DEV_DMA_COHERENT)
args->capabilities |= DRM_TEGRA_CHANNEL_CAP_CACHE_COHERENT;
return 0;
put_channel:
host1x_channel_put(context->channel);
free:
kfree(context);
return err;
}
int tegra_drm_ioctl_channel_close(struct drm_device *drm, void *data, struct drm_file *file)
{
struct tegra_drm_file *fpriv = file->driver_priv;
struct drm_tegra_channel_close *args = data;
struct tegra_drm_context *context;
mutex_lock(&fpriv->lock);
context = xa_load(&fpriv->contexts, args->context);
if (!context) {
mutex_unlock(&fpriv->lock);
return -EINVAL;
}
xa_erase(&fpriv->contexts, args->context);
mutex_unlock(&fpriv->lock);
tegra_drm_channel_context_close(context);
return 0;
}
int tegra_drm_ioctl_channel_map(struct drm_device *drm, void *data, struct drm_file *file)
{
struct tegra_drm_file *fpriv = file->driver_priv;
struct drm_tegra_channel_map *args = data;
struct tegra_drm_mapping *mapping;
struct tegra_drm_context *context;
int err = 0;
if (args->flags & ~DRM_TEGRA_CHANNEL_MAP_READ_WRITE)
return -EINVAL;
mutex_lock(&fpriv->lock);
context = xa_load(&fpriv->contexts, args->context);
if (!context) {
mutex_unlock(&fpriv->lock);
return -EINVAL;
}
mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
if (!mapping) {
err = -ENOMEM;
goto unlock;
}
kref_init(&mapping->ref);
mapping->dev = context->client->base.dev;
mapping->bo = tegra_gem_lookup(file, args->handle);
if (!mapping->bo) {
err = -EINVAL;
goto unlock;
}
if (context->client->base.group) {
/* IOMMU domain managed directly using IOMMU API */
host1x_bo_pin(mapping->dev, mapping->bo, &mapping->iova);
} else {
switch (args->flags & DRM_TEGRA_CHANNEL_MAP_READ_WRITE) {
case DRM_TEGRA_CHANNEL_MAP_READ_WRITE:
mapping->direction = DMA_BIDIRECTIONAL;
break;
case DRM_TEGRA_CHANNEL_MAP_WRITE:
mapping->direction = DMA_FROM_DEVICE;
break;
case DRM_TEGRA_CHANNEL_MAP_READ:
mapping->direction = DMA_TO_DEVICE;
break;
default:
/* a mapping with neither read nor write access makes no sense */
err = -EINVAL;
goto put_gem;
}
mapping->sgt = host1x_bo_pin(mapping->dev, mapping->bo, NULL);
if (IS_ERR(mapping->sgt)) {
err = PTR_ERR(mapping->sgt);
goto put_gem;
}
err = dma_map_sgtable(mapping->dev, mapping->sgt, mapping->direction,
DMA_ATTR_SKIP_CPU_SYNC);
if (err)
goto unpin;
mapping->iova = sg_dma_address(mapping->sgt->sgl);
}
mapping->iova_end = mapping->iova + host1x_to_tegra_bo(mapping->bo)->size;
err = xa_alloc(&context->mappings, &args->mapping, mapping, XA_LIMIT(1, U32_MAX),
GFP_KERNEL);
if (err < 0)
goto unmap;
mutex_unlock(&fpriv->lock);
return 0;
unmap:
if (mapping->sgt) {
dma_unmap_sgtable(mapping->dev, mapping->sgt, mapping->direction,
DMA_ATTR_SKIP_CPU_SYNC);
}
unpin:
host1x_bo_unpin(mapping->dev, mapping->bo, mapping->sgt);
put_gem:
host1x_bo_put(mapping->bo);
kfree(mapping);
unlock:
mutex_unlock(&fpriv->lock);
return err;
}
int tegra_drm_ioctl_channel_unmap(struct drm_device *drm, void *data, struct drm_file *file)
{
struct tegra_drm_file *fpriv = file->driver_priv;
struct drm_tegra_channel_unmap *args = data;
struct tegra_drm_mapping *mapping;
struct tegra_drm_context *context;
mutex_lock(&fpriv->lock);
context = xa_load(&fpriv->contexts, args->context);
if (!context) {
mutex_unlock(&fpriv->lock);
return -EINVAL;
}
mapping = xa_erase(&context->mappings, args->mapping);
mutex_unlock(&fpriv->lock);
if (!mapping)
return -EINVAL;
tegra_drm_mapping_put(mapping);
return 0;
}
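/* Allocate a client-managed host1x syncpoint and track it in the file's syncpoint xarray */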
int tegra_drm_ioctl_syncpoint_allocate(struct drm_device *drm, void *data, struct drm_file *file)
{
struct host1x *host1x = tegra_drm_to_host1x(drm->dev_private);
struct tegra_drm_file *fpriv = file->driver_priv;
struct drm_tegra_syncpoint_allocate *args = data;
struct host1x_syncpt *sp;
int err;
if (args->id)
return -EINVAL;
sp = host1x_syncpt_alloc(host1x, HOST1X_SYNCPT_CLIENT_MANAGED, current->comm);
if (!sp)
return -EBUSY;
args->id = host1x_syncpt_id(sp);
err = xa_insert(&fpriv->syncpoints, args->id, sp, GFP_KERNEL);
if (err) {
host1x_syncpt_put(sp);
return err;
}
return 0;
}
int tegra_drm_ioctl_syncpoint_free(struct drm_device *drm, void *data, struct drm_file *file)
{
struct tegra_drm_file *fpriv = file->driver_priv;
struct drm_tegra_syncpoint_allocate *args = data;
struct host1x_syncpt *sp;
mutex_lock(&fpriv->lock);
sp = xa_erase(&fpriv->syncpoints, args->id);
mutex_unlock(&fpriv->lock);
if (!sp)
return -EINVAL;
host1x_syncpt_put(sp);
return 0;
}
int tegra_drm_ioctl_syncpoint_wait(struct drm_device *drm, void *data, struct drm_file *file)
{
struct host1x *host1x = tegra_drm_to_host1x(drm->dev_private);
struct drm_tegra_syncpoint_wait *args = data;
signed long timeout_jiffies;
struct host1x_syncpt *sp;
if (args->padding != 0)
return -EINVAL;
sp = host1x_syncpt_get_by_id_noref(host1x, args->id);
if (!sp)
return -EINVAL;
timeout_jiffies = drm_timeout_abs_to_jiffies(args->timeout_ns);
return host1x_syncpt_wait(sp, args->threshold, timeout_jiffies, &args->value);
}
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2020 NVIDIA Corporation */
#ifndef _TEGRA_DRM_UAPI_H
#define _TEGRA_DRM_UAPI_H
#include <linux/dma-mapping.h>
#include <linux/idr.h>
#include <linux/kref.h>
#include <linux/xarray.h>
#include <drm/drm.h>
struct drm_file;
struct drm_device;
struct tegra_drm_file {
/* Legacy UAPI state */
struct idr legacy_contexts;
struct mutex lock;
/* New UAPI state */
struct xarray contexts;
struct xarray syncpoints;
};
struct tegra_drm_mapping {
struct kref ref;
struct device *dev;
struct host1x_bo *bo;
struct sg_table *sgt;
enum dma_data_direction direction;
dma_addr_t iova;
dma_addr_t iova_end;
};
int tegra_drm_ioctl_channel_open(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_channel_close(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_channel_map(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_channel_unmap(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_channel_submit(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_syncpoint_allocate(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_syncpoint_free(struct drm_device *drm, void *data,
struct drm_file *file);
int tegra_drm_ioctl_syncpoint_wait(struct drm_device *drm, void *data,
struct drm_file *file);
void tegra_drm_uapi_close_file(struct tegra_drm_file *file);
void tegra_drm_mapping_put(struct tegra_drm_mapping *mapping);
#endif
...@@ -29,7 +29,6 @@ struct vic_config { ...@@ -29,7 +29,6 @@ struct vic_config {
struct vic { struct vic {
struct falcon falcon; struct falcon falcon;
bool booted;
void __iomem *regs; void __iomem *regs;
struct tegra_drm_client client; struct tegra_drm_client client;
...@@ -52,48 +51,6 @@ static void vic_writel(struct vic *vic, u32 value, unsigned int offset) ...@@ -52,48 +51,6 @@ static void vic_writel(struct vic *vic, u32 value, unsigned int offset)
writel(value, vic->regs + offset); writel(value, vic->regs + offset);
} }
static int vic_runtime_resume(struct device *dev)
{
struct vic *vic = dev_get_drvdata(dev);
int err;
err = clk_prepare_enable(vic->clk);
if (err < 0)
return err;
usleep_range(10, 20);
err = reset_control_deassert(vic->rst);
if (err < 0)
goto disable;
usleep_range(10, 20);
return 0;
disable:
clk_disable_unprepare(vic->clk);
return err;
}
static int vic_runtime_suspend(struct device *dev)
{
struct vic *vic = dev_get_drvdata(dev);
int err;
err = reset_control_assert(vic->rst);
if (err < 0)
return err;
usleep_range(2000, 4000);
clk_disable_unprepare(vic->clk);
vic->booted = false;
return 0;
}
static int vic_boot(struct vic *vic) static int vic_boot(struct vic *vic)
{ {
#ifdef CONFIG_IOMMU_API #ifdef CONFIG_IOMMU_API
...@@ -103,9 +60,6 @@ static int vic_boot(struct vic *vic) ...@@ -103,9 +60,6 @@ static int vic_boot(struct vic *vic)
void *hdr; void *hdr;
int err = 0; int err = 0;
if (vic->booted)
return 0;
#ifdef CONFIG_IOMMU_API #ifdef CONFIG_IOMMU_API
if (vic->config->supports_sid && spec) { if (vic->config->supports_sid && spec) {
u32 value; u32 value;
...@@ -168,8 +122,6 @@ static int vic_boot(struct vic *vic) ...@@ -168,8 +122,6 @@ static int vic_boot(struct vic *vic)
return err; return err;
} }
vic->booted = true;
return 0; return 0;
} }
...@@ -323,35 +275,74 @@ static int vic_load_firmware(struct vic *vic) ...@@ -323,35 +275,74 @@ static int vic_load_firmware(struct vic *vic)
return err; return err;
} }
static int vic_open_channel(struct tegra_drm_client *client,
struct tegra_drm_context *context) static int vic_runtime_resume(struct device *dev)
{ {
struct vic *vic = to_vic(client); struct vic *vic = dev_get_drvdata(dev);
int err; int err;
err = pm_runtime_resume_and_get(vic->dev); err = clk_prepare_enable(vic->clk);
if (err < 0) if (err < 0)
return err; return err;
usleep_range(10, 20);
err = reset_control_deassert(vic->rst);
if (err < 0)
goto disable;
usleep_range(10, 20);
err = vic_load_firmware(vic); err = vic_load_firmware(vic);
if (err < 0) if (err < 0)
goto rpm_put; goto assert;
err = vic_boot(vic); err = vic_boot(vic);
if (err < 0) if (err < 0)
goto rpm_put; goto assert;
return 0;
assert:
reset_control_assert(vic->rst);
disable:
clk_disable_unprepare(vic->clk);
return err;
}
static int vic_runtime_suspend(struct device *dev)
{
struct vic *vic = dev_get_drvdata(dev);
int err;
err = reset_control_assert(vic->rst);
if (err < 0)
return err;
usleep_range(2000, 4000);
clk_disable_unprepare(vic->clk);
return 0;
}
static int vic_open_channel(struct tegra_drm_client *client,
struct tegra_drm_context *context)
{
struct vic *vic = to_vic(client);
int err;
err = pm_runtime_resume_and_get(vic->dev);
if (err < 0)
return err;
context->channel = host1x_channel_get(vic->channel); context->channel = host1x_channel_get(vic->channel);
if (!context->channel) { if (!context->channel) {
err = -ENOMEM; pm_runtime_put(vic->dev);
goto rpm_put; return -ENOMEM;
} }
return 0; return 0;
rpm_put:
pm_runtime_put(vic->dev);
return err;
} }
static void vic_close_channel(struct tegra_drm_context *context) static void vic_close_channel(struct tegra_drm_context *context)
...@@ -359,7 +350,6 @@ static void vic_close_channel(struct tegra_drm_context *context) ...@@ -359,7 +350,6 @@ static void vic_close_channel(struct tegra_drm_context *context)
struct vic *vic = to_vic(context->client); struct vic *vic = to_vic(context->client);
host1x_channel_put(context->channel); host1x_channel_put(context->channel);
pm_runtime_put(vic->dev); pm_runtime_put(vic->dev);
} }
......
...@@ -9,6 +9,7 @@ host1x-y = \ ...@@ -9,6 +9,7 @@ host1x-y = \
job.o \ job.o \
debug.o \ debug.o \
mipi.o \ mipi.o \
fence.o \
hw/host1x01.o \ hw/host1x01.o \
hw/host1x02.o \ hw/host1x02.o \
hw/host1x04.o \ hw/host1x04.o \
......
...@@ -312,10 +312,6 @@ static void update_cdma_locked(struct host1x_cdma *cdma) ...@@ -312,10 +312,6 @@ static void update_cdma_locked(struct host1x_cdma *cdma)
bool signal = false; bool signal = false;
struct host1x_job *job, *n; struct host1x_job *job, *n;
/* If CDMA is stopped, queue is cleared and we can return */
if (!cdma->running)
return;
/* /*
* Walk the sync queue, reading the sync point registers as necessary, * Walk the sync queue, reading the sync point registers as necessary,
* to consume as many sync queue entries as possible without blocking * to consume as many sync queue entries as possible without blocking
...@@ -324,7 +320,8 @@ static void update_cdma_locked(struct host1x_cdma *cdma) ...@@ -324,7 +320,8 @@ static void update_cdma_locked(struct host1x_cdma *cdma)
struct host1x_syncpt *sp = job->syncpt; struct host1x_syncpt *sp = job->syncpt;
/* Check whether this syncpt has completed, and bail if not */ /* Check whether this syncpt has completed, and bail if not */
if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) { if (!host1x_syncpt_is_expired(sp, job->syncpt_end) &&
!job->cancelled) {
/* Start timer on next pending syncpt */ /* Start timer on next pending syncpt */
if (job->timeout) if (job->timeout)
cdma_start_timer_locked(cdma, job); cdma_start_timer_locked(cdma, job);
...@@ -413,8 +410,11 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma, ...@@ -413,8 +410,11 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
else else
restart_addr = cdma->last_pos; restart_addr = cdma->last_pos;
if (!job)
goto resume;
/* do CPU increments for the remaining syncpts */ /* do CPU increments for the remaining syncpts */
if (job) { if (job->syncpt_recovery) {
dev_dbg(dev, "%s: perform CPU incr on pending buffers\n", dev_dbg(dev, "%s: perform CPU incr on pending buffers\n",
__func__); __func__);
...@@ -433,8 +433,44 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma, ...@@ -433,8 +433,44 @@ void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
dev_dbg(dev, "%s: finished sync_queue modification\n", dev_dbg(dev, "%s: finished sync_queue modification\n",
__func__); __func__);
} else {
struct host1x_job *failed_job = job;
host1x_job_dump(dev, job);
host1x_syncpt_set_locked(job->syncpt);
failed_job->cancelled = true;
list_for_each_entry_continue(job, &cdma->sync_queue, list) {
unsigned int i;
if (job->syncpt != failed_job->syncpt)
continue;
for (i = 0; i < job->num_slots; i++) {
unsigned int slot = (job->first_get/8 + i) %
HOST1X_PUSHBUFFER_SLOTS;
u32 *mapped = cdma->push_buffer.mapped;
/*
* Overwrite opcodes with 0 word writes
* to offset 0xbad. This does nothing but
* has an easily detected signature in debug
* traces.
*/
mapped[2*slot+0] = 0x1bad0000;
mapped[2*slot+1] = 0x1bad0000;
}
job->cancelled = true;
} }
wmb();
update_cdma_locked(cdma);
}
resume:
/* roll back DMAGET and start up channel again */ /* roll back DMAGET and start up channel again */
host1x_hw_cdma_resume(host1x, cdma, restart_addr); host1x_hw_cdma_resume(host1x, cdma, restart_addr);
} }
...@@ -490,6 +526,16 @@ int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job) ...@@ -490,6 +526,16 @@ int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job)
mutex_lock(&cdma->lock); mutex_lock(&cdma->lock);
/*
* Check if syncpoint was locked due to previous job timeout.
* This needs to be done within the cdma lock to avoid a race
* with the timeout handler.
*/
if (job->syncpt->locked) {
mutex_unlock(&cdma->lock);
return -EPERM;
}
if (job->timeout) { if (job->timeout) {
/* init state on first submit with timeout value */ /* init state on first submit with timeout value */
if (!cdma->timeout.initialized) { if (!cdma->timeout.initialized) {
......
// SPDX-License-Identifier: GPL-2.0-only
/*
* Syncpoint dma_fence implementation
*
* Copyright (c) 2020, NVIDIA Corporation.
*/
#include <linux/dma-fence.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sync_file.h>
#include "fence.h"
#include "intr.h"
#include "syncpt.h"
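/* all host1x syncpoint fences share this lock for dma_fence signalling */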
static DEFINE_SPINLOCK(lock);
struct host1x_syncpt_fence {
struct dma_fence base;
atomic_t signaling;
struct host1x_syncpt *sp;
u32 threshold;
struct host1x_waitlist *waiter;
void *waiter_ref;
struct delayed_work timeout_work;
};
static const char *host1x_syncpt_fence_get_driver_name(struct dma_fence *f)
{
return "host1x";
}
static const char *host1x_syncpt_fence_get_timeline_name(struct dma_fence *f)
{
return "syncpoint";
}
static struct host1x_syncpt_fence *to_host1x_fence(struct dma_fence *f)
{
return container_of(f, struct host1x_syncpt_fence, base);
}
static bool host1x_syncpt_fence_enable_signaling(struct dma_fence *f)
{
struct host1x_syncpt_fence *sf = to_host1x_fence(f);
int err;
if (host1x_syncpt_is_expired(sf->sp, sf->threshold))
return false;
dma_fence_get(f);
/*
* The dma_fence framework requires the fence driver to keep a
* reference to any fences for which 'enable_signaling' has been
* called (and that have not been signalled).
*
* We provide a userspace API to create arbitrary syncpoint fences,
* so we cannot normally guarantee that all fences get signalled.
* As such, set up a timeout, so that long-lasting fences will get
* reaped eventually.
*/
schedule_delayed_work(&sf->timeout_work, msecs_to_jiffies(30000));
err = host1x_intr_add_action(sf->sp->host, sf->sp, sf->threshold,
HOST1X_INTR_ACTION_SIGNAL_FENCE, f,
sf->waiter, &sf->waiter_ref);
if (err) {
cancel_delayed_work_sync(&sf->timeout_work);
dma_fence_put(f);
return false;
}
/* intr framework takes ownership of waiter */
sf->waiter = NULL;
/*
* The fence may get signalled at any time after the above call,
* so we need to initialize all state used by signalling
* before it.
*/
return true;
}
static void host1x_syncpt_fence_release(struct dma_fence *f)
{
struct host1x_syncpt_fence *sf = to_host1x_fence(f);
kfree(sf->waiter);
dma_fence_free(f);
}
const struct dma_fence_ops host1x_syncpt_fence_ops = {
.get_driver_name = host1x_syncpt_fence_get_driver_name,
.get_timeline_name = host1x_syncpt_fence_get_timeline_name,
.enable_signaling = host1x_syncpt_fence_enable_signaling,
.release = host1x_syncpt_fence_release,
};
void host1x_fence_signal(struct host1x_syncpt_fence *f)
{
if (atomic_xchg(&f->signaling, 1))
return;
/*
* Cancel pending timeout work - if it races, it will
* not get 'f->signaling' and return.
*/
cancel_delayed_work_sync(&f->timeout_work);
host1x_intr_put_ref(f->sp->host, f->sp->id, f->waiter_ref, false);
dma_fence_signal(&f->base);
dma_fence_put(&f->base);
}
static void do_fence_timeout(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct host1x_syncpt_fence *f =
container_of(dwork, struct host1x_syncpt_fence, timeout_work);
if (atomic_xchg(&f->signaling, 1))
return;
/*
* The timeout expired before the fence was signalled through the
* interrupt path. Drop the waiter reference and signal the fence
* with a timeout error; if the interrupt path races, it will not
* get 'f->signaling' and will return early.
*/
host1x_intr_put_ref(f->sp->host, f->sp->id, f->waiter_ref, true);
dma_fence_set_error(&f->base, -ETIMEDOUT);
dma_fence_signal(&f->base);
dma_fence_put(&f->base);
}
struct dma_fence *host1x_fence_create(struct host1x_syncpt *sp, u32 threshold)
{
struct host1x_syncpt_fence *fence;
fence = kzalloc(sizeof(*fence), GFP_KERNEL);
if (!fence)
return ERR_PTR(-ENOMEM);
fence->waiter = kzalloc(sizeof(*fence->waiter), GFP_KERNEL);
if (!fence->waiter) {
kfree(fence);
return ERR_PTR(-ENOMEM);
}
fence->sp = sp;
fence->threshold = threshold;
dma_fence_init(&fence->base, &host1x_syncpt_fence_ops, &lock,
dma_fence_context_alloc(1), 0);
INIT_DELAYED_WORK(&fence->timeout_work, do_fence_timeout);
return &fence->base;
}
EXPORT_SYMBOL(host1x_fence_create);
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2020, NVIDIA Corporation.
*/
#ifndef HOST1X_FENCE_H
#define HOST1X_FENCE_H
struct host1x_syncpt_fence;
void host1x_fence_signal(struct host1x_syncpt_fence *fence);
#endif
...@@ -47,16 +47,60 @@ static void trace_write_gather(struct host1x_cdma *cdma, struct host1x_bo *bo, ...@@ -47,16 +47,60 @@ static void trace_write_gather(struct host1x_cdma *cdma, struct host1x_bo *bo,
} }
} }
static void submit_gathers(struct host1x_job *job) static void submit_wait(struct host1x_cdma *cdma, u32 id, u32 threshold,
u32 next_class)
{
#if HOST1X_HW >= 2
host1x_cdma_push_wide(cdma,
host1x_opcode_setclass(
HOST1X_CLASS_HOST1X,
HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32,
/* WAIT_SYNCPT_32 is at SYNCPT_PAYLOAD_32+2 */
BIT(0) | BIT(2)
),
threshold,
id,
host1x_opcode_setclass(next_class, 0, 0)
);
#else
/* TODO add waitchk or use waitbases or other mitigation */
host1x_cdma_push(cdma,
host1x_opcode_setclass(
HOST1X_CLASS_HOST1X,
host1x_uclass_wait_syncpt_r(),
BIT(0)
),
host1x_class_host_wait_syncpt(id, threshold)
);
host1x_cdma_push(cdma,
host1x_opcode_setclass(next_class, 0, 0),
HOST1X_OPCODE_NOP
);
#endif
}
static void submit_gathers(struct host1x_job *job, u32 job_syncpt_base)
{ {
struct host1x_cdma *cdma = &job->channel->cdma; struct host1x_cdma *cdma = &job->channel->cdma;
#if HOST1X_HW < 6 #if HOST1X_HW < 6
struct device *dev = job->channel->dev; struct device *dev = job->channel->dev;
#endif #endif
unsigned int i; unsigned int i;
u32 threshold;
for (i = 0; i < job->num_cmds; i++) {
struct host1x_job_cmd *cmd = &job->cmds[i];
if (cmd->is_wait) {
if (cmd->wait.relative)
threshold = job_syncpt_base + cmd->wait.threshold;
else
threshold = cmd->wait.threshold;
submit_wait(cdma, cmd->wait.id, threshold, cmd->wait.next_class);
} else {
struct host1x_job_gather *g = &cmd->gather;
for (i = 0; i < job->num_gathers; i++) {
struct host1x_job_gather *g = &job->gathers[i];
dma_addr_t addr = g->base + g->offset; dma_addr_t addr = g->base + g->offset;
u32 op2, op3; u32 op2, op3;
...@@ -82,6 +126,7 @@ static void submit_gathers(struct host1x_job *job) ...@@ -82,6 +126,7 @@ static void submit_gathers(struct host1x_job *job)
host1x_cdma_push(cdma, op1, op2); host1x_cdma_push(cdma, op1, op2);
} }
} }
}
} }
static inline void synchronize_syncpt_base(struct host1x_job *job) static inline void synchronize_syncpt_base(struct host1x_job *job)
...@@ -126,7 +171,7 @@ static int channel_submit(struct host1x_job *job) ...@@ -126,7 +171,7 @@ static int channel_submit(struct host1x_job *job)
struct host1x *host = dev_get_drvdata(ch->dev->parent); struct host1x *host = dev_get_drvdata(ch->dev->parent);
trace_host1x_channel_submit(dev_name(ch->dev), trace_host1x_channel_submit(dev_name(ch->dev),
job->num_gathers, job->num_relocs, job->num_cmds, job->num_relocs,
job->syncpt->id, job->syncpt_incrs); job->syncpt->id, job->syncpt_incrs);
/* before error checks, return current max */ /* before error checks, return current max */
...@@ -181,7 +226,7 @@ static int channel_submit(struct host1x_job *job) ...@@ -181,7 +226,7 @@ static int channel_submit(struct host1x_job *job)
host1x_opcode_setclass(job->class, 0, 0), host1x_opcode_setclass(job->class, 0, 0),
HOST1X_OPCODE_NOP); HOST1X_OPCODE_NOP);
submit_gathers(job); submit_gathers(job, syncval - user_syncpt_incrs);
/* end CDMA submit & stash pinned hMems into sync queue */ /* end CDMA submit & stash pinned hMems into sync queue */
host1x_cdma_end(&ch->cdma, job); host1x_cdma_end(&ch->cdma, job);
...@@ -191,7 +236,7 @@ static int channel_submit(struct host1x_job *job) ...@@ -191,7 +236,7 @@ static int channel_submit(struct host1x_job *job)
/* schedule a submit complete interrupt */ /* schedule a submit complete interrupt */
err = host1x_intr_add_action(host, sp, syncval, err = host1x_intr_add_action(host, sp, syncval,
HOST1X_INTR_ACTION_SUBMIT_COMPLETE, ch, HOST1X_INTR_ACTION_SUBMIT_COMPLETE, ch,
completed_waiter, NULL); completed_waiter, &job->waiter);
completed_waiter = NULL; completed_waiter = NULL;
WARN(err, "Failed to set submit complete interrupt"); WARN(err, "Failed to set submit complete interrupt");
......
...@@ -156,9 +156,9 @@ static unsigned int show_channel_command(struct output *o, u32 val, ...@@ -156,9 +156,9 @@ static unsigned int show_channel_command(struct output *o, u32 val,
} }
} }
static void show_gather(struct output *o, phys_addr_t phys_addr, static void show_gather(struct output *o, dma_addr_t phys_addr,
unsigned int words, struct host1x_cdma *cdma, unsigned int words, struct host1x_cdma *cdma,
phys_addr_t pin_addr, u32 *map_addr) dma_addr_t pin_addr, u32 *map_addr)
{ {
/* Map dmaget cursor to corresponding mem handle */ /* Map dmaget cursor to corresponding mem handle */
u32 offset = phys_addr - pin_addr; u32 offset = phys_addr - pin_addr;
...@@ -176,11 +176,11 @@ static void show_gather(struct output *o, phys_addr_t phys_addr, ...@@ -176,11 +176,11 @@ static void show_gather(struct output *o, phys_addr_t phys_addr,
} }
for (i = 0; i < words; i++) { for (i = 0; i < words; i++) {
u32 addr = phys_addr + i * 4; dma_addr_t addr = phys_addr + i * 4;
u32 val = *(map_addr + offset / 4 + i); u32 val = *(map_addr + offset / 4 + i);
if (!data_count) { if (!data_count) {
host1x_debug_output(o, "%08x: %08x: ", addr, val); host1x_debug_output(o, " %pad: %08x: ", &addr, val);
data_count = show_channel_command(o, val, &payload); data_count = show_channel_command(o, val, &payload);
} else { } else {
host1x_debug_cont(o, "%08x%s", val, host1x_debug_cont(o, "%08x%s", val,
...@@ -195,23 +195,25 @@ static void show_channel_gathers(struct output *o, struct host1x_cdma *cdma) ...@@ -195,23 +195,25 @@ static void show_channel_gathers(struct output *o, struct host1x_cdma *cdma)
struct push_buffer *pb = &cdma->push_buffer; struct push_buffer *pb = &cdma->push_buffer;
struct host1x_job *job; struct host1x_job *job;
host1x_debug_output(o, "PUSHBUF at %pad, %u words\n",
&pb->dma, pb->size / 4);
show_gather(o, pb->dma, pb->size / 4, cdma, pb->dma, pb->mapped);
list_for_each_entry(job, &cdma->sync_queue, list) { list_for_each_entry(job, &cdma->sync_queue, list) {
unsigned int i; unsigned int i;
host1x_debug_output(o, "\n%p: JOB, syncpt_id=%d, syncpt_val=%d, first_get=%08x, timeout=%d num_slots=%d, num_handles=%d\n", host1x_debug_output(o, "JOB, syncpt %u: %u timeout: %u num_slots: %u num_handles: %u\n",
job, job->syncpt->id, job->syncpt_end, job->syncpt->id, job->syncpt_end, job->timeout,
job->first_get, job->timeout,
job->num_slots, job->num_unpins); job->num_slots, job->num_unpins);
for (i = 0; i < job->num_gathers; i++) { show_gather(o, pb->dma + job->first_get, job->num_slots * 2, cdma,
struct host1x_job_gather *g = &job->gathers[i]; pb->dma + job->first_get, pb->mapped + job->first_get);
for (i = 0; i < job->num_cmds; i++) {
struct host1x_job_gather *g;
u32 *mapped; u32 *mapped;
if (job->cmds[i].is_wait)
continue;
g = &job->cmds[i].gather;
if (job->gather_copy_mapped) if (job->gather_copy_mapped)
mapped = (u32 *)job->gather_copy_mapped; mapped = (u32 *)job->gather_copy_mapped;
else else
......
...@@ -16,10 +16,13 @@ static void host1x_debug_show_channel_cdma(struct host1x *host, ...@@ -16,10 +16,13 @@ static void host1x_debug_show_channel_cdma(struct host1x *host,
struct output *o) struct output *o)
{ {
struct host1x_cdma *cdma = &ch->cdma; struct host1x_cdma *cdma = &ch->cdma;
dma_addr_t dmastart, dmaend;
u32 dmaput, dmaget, dmactrl; u32 dmaput, dmaget, dmactrl;
u32 cbstat, cbread; u32 cbstat, cbread;
u32 val, base, baseval; u32 val, base, baseval;
dmastart = host1x_ch_readl(ch, HOST1X_CHANNEL_DMASTART);
dmaend = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAEND);
dmaput = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT); dmaput = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT);
dmaget = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET); dmaget = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET);
dmactrl = host1x_ch_readl(ch, HOST1X_CHANNEL_DMACTRL); dmactrl = host1x_ch_readl(ch, HOST1X_CHANNEL_DMACTRL);
...@@ -56,9 +59,10 @@ static void host1x_debug_show_channel_cdma(struct host1x *host, ...@@ -56,9 +59,10 @@ static void host1x_debug_show_channel_cdma(struct host1x *host,
HOST1X_SYNC_CBSTAT_CBOFFSET_V(cbstat), HOST1X_SYNC_CBSTAT_CBOFFSET_V(cbstat),
cbread); cbread);
host1x_debug_output(o, "DMAPUT %08x, DMAGET %08x, DMACTL %08x\n", host1x_debug_output(o, "DMASTART %pad, DMAEND %pad\n", &dmastart, &dmaend);
host1x_debug_output(o, "DMAPUT %08x DMAGET %08x DMACTL %08x\n",
dmaput, dmaget, dmactrl); dmaput, dmaget, dmactrl);
host1x_debug_output(o, "CBREAD %08x, CBSTAT %08x\n", cbread, cbstat); host1x_debug_output(o, "CBREAD %08x CBSTAT %08x\n", cbread, cbstat);
show_channel_gathers(o, cdma); show_channel_gathers(o, cdma);
host1x_debug_output(o, "\n"); host1x_debug_output(o, "\n");
......
...@@ -16,10 +16,23 @@ static void host1x_debug_show_channel_cdma(struct host1x *host, ...@@ -16,10 +16,23 @@ static void host1x_debug_show_channel_cdma(struct host1x *host,
struct output *o) struct output *o)
{ {
struct host1x_cdma *cdma = &ch->cdma; struct host1x_cdma *cdma = &ch->cdma;
dma_addr_t dmastart = 0, dmaend = 0;
u32 dmaput, dmaget, dmactrl; u32 dmaput, dmaget, dmactrl;
u32 offset, class; u32 offset, class;
u32 ch_stat; u32 ch_stat;
#if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && HOST1X_HW >= 6
dmastart = host1x_ch_readl(ch, HOST1X_CHANNEL_DMASTART_HI);
dmastart <<= 32;
#endif
dmastart |= host1x_ch_readl(ch, HOST1X_CHANNEL_DMASTART);
#if defined(CONFIG_ARCH_DMA_ADDR_T_64BIT) && HOST1X_HW >= 6
dmaend = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAEND_HI);
dmaend <<= 32;
#endif
dmaend |= host1x_ch_readl(ch, HOST1X_CHANNEL_DMAEND);
dmaput = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT); dmaput = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT);
dmaget = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET); dmaget = host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET);
dmactrl = host1x_ch_readl(ch, HOST1X_CHANNEL_DMACTRL); dmactrl = host1x_ch_readl(ch, HOST1X_CHANNEL_DMACTRL);
...@@ -41,7 +54,8 @@ static void host1x_debug_show_channel_cdma(struct host1x *host, ...@@ -41,7 +54,8 @@ static void host1x_debug_show_channel_cdma(struct host1x *host,
host1x_debug_output(o, "active class %02x, offset %04x\n", host1x_debug_output(o, "active class %02x, offset %04x\n",
class, offset); class, offset);
host1x_debug_output(o, "DMAPUT %08x, DMAGET %08x, DMACTL %08x\n", host1x_debug_output(o, "DMASTART %pad, DMAEND %pad\n", &dmastart, &dmaend);
host1x_debug_output(o, "DMAPUT %08x DMAGET %08x DMACTL %08x\n",
dmaput, dmaget, dmactrl); dmaput, dmaget, dmactrl);
host1x_debug_output(o, "CHANNELSTAT %02x\n", ch_stat); host1x_debug_output(o, "CHANNELSTAT %02x\n", ch_stat);
......
...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void) ...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
} }
#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \ #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
host1x_uclass_indoff_indroffset_f(v) host1x_uclass_indoff_indroffset_f(v)
static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
{
return 0x4e;
}
#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
host1x_uclass_load_syncpt_payload_32_r()
static inline u32 host1x_uclass_wait_syncpt_32_r(void)
{
return 0x50;
}
#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
host1x_uclass_wait_syncpt_32_r()
#endif #endif
...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void) ...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
} }
#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \ #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
host1x_uclass_indoff_indroffset_f(v) host1x_uclass_indoff_indroffset_f(v)
static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
{
return 0x4e;
}
#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
host1x_uclass_load_syncpt_payload_32_r()
static inline u32 host1x_uclass_wait_syncpt_32_r(void)
{
return 0x50;
}
#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
host1x_uclass_wait_syncpt_32_r()
#endif #endif
...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void) ...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
} }
#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \ #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
host1x_uclass_indoff_indroffset_f(v) host1x_uclass_indoff_indroffset_f(v)
static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
{
return 0x4e;
}
#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
host1x_uclass_load_syncpt_payload_32_r()
static inline u32 host1x_uclass_wait_syncpt_32_r(void)
{
return 0x50;
}
#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
host1x_uclass_wait_syncpt_32_r()
#endif #endif
...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void) ...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
} }
#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \ #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
host1x_uclass_indoff_indroffset_f(v) host1x_uclass_indoff_indroffset_f(v)
static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
{
return 0x4e;
}
#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
host1x_uclass_load_syncpt_payload_32_r()
static inline u32 host1x_uclass_wait_syncpt_32_r(void)
{
return 0x50;
}
#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
host1x_uclass_wait_syncpt_32_r()
#endif #endif
...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void) ...@@ -165,5 +165,17 @@ static inline u32 host1x_uclass_indoff_rwn_read_v(void)
} }
#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \ #define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
host1x_uclass_indoff_indroffset_f(v) host1x_uclass_indoff_indroffset_f(v)
static inline u32 host1x_uclass_load_syncpt_payload_32_r(void)
{
return 0x4e;
}
#define HOST1X_UCLASS_LOAD_SYNCPT_PAYLOAD_32 \
host1x_uclass_load_syncpt_payload_32_r()
static inline u32 host1x_uclass_wait_syncpt_32_r(void)
{
return 0x50;
}
#define HOST1X_UCLASS_WAIT_SYNCPT_32 \
host1x_uclass_wait_syncpt_32_r()
#endif #endif
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <trace/events/host1x.h> #include <trace/events/host1x.h>
#include "channel.h" #include "channel.h"
#include "dev.h" #include "dev.h"
#include "fence.h"
#include "intr.h" #include "intr.h"
/* Wait list management */ /* Wait list management */
...@@ -121,12 +122,20 @@ static void action_wakeup_interruptible(struct host1x_waitlist *waiter) ...@@ -121,12 +122,20 @@ static void action_wakeup_interruptible(struct host1x_waitlist *waiter)
wake_up_interruptible(wq); wake_up_interruptible(wq);
} }
static void action_signal_fence(struct host1x_waitlist *waiter)
{
struct host1x_syncpt_fence *f = waiter->data;
host1x_fence_signal(f);
}
typedef void (*action_handler)(struct host1x_waitlist *waiter); typedef void (*action_handler)(struct host1x_waitlist *waiter);
static const action_handler action_handlers[HOST1X_INTR_ACTION_COUNT] = { static const action_handler action_handlers[HOST1X_INTR_ACTION_COUNT] = {
action_submit_complete, action_submit_complete,
action_wakeup, action_wakeup,
action_wakeup_interruptible, action_wakeup_interruptible,
action_signal_fence,
}; };
static void run_handlers(struct list_head completed[HOST1X_INTR_ACTION_COUNT]) static void run_handlers(struct list_head completed[HOST1X_INTR_ACTION_COUNT])
......
...@@ -33,6 +33,8 @@ enum host1x_intr_action { ...@@ -33,6 +33,8 @@ enum host1x_intr_action {
*/ */
HOST1X_INTR_ACTION_WAKEUP_INTERRUPTIBLE, HOST1X_INTR_ACTION_WAKEUP_INTERRUPTIBLE,
HOST1X_INTR_ACTION_SIGNAL_FENCE,
HOST1X_INTR_ACTION_COUNT HOST1X_INTR_ACTION_COUNT
}; };
......
...@@ -24,21 +24,25 @@ ...@@ -24,21 +24,25 @@
#define HOST1X_WAIT_SYNCPT_OFFSET 0x8 #define HOST1X_WAIT_SYNCPT_OFFSET 0x8
struct host1x_job *host1x_job_alloc(struct host1x_channel *ch, struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
u32 num_cmdbufs, u32 num_relocs) u32 num_cmdbufs, u32 num_relocs,
bool skip_firewall)
{ {
struct host1x_job *job = NULL; struct host1x_job *job = NULL;
unsigned int num_unpins = num_relocs; unsigned int num_unpins = num_relocs;
bool enable_firewall;
u64 total; u64 total;
void *mem; void *mem;
if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) enable_firewall = IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) && !skip_firewall;
if (!enable_firewall)
num_unpins += num_cmdbufs; num_unpins += num_cmdbufs;
/* Check that we're not going to overflow */ /* Check that we're not going to overflow */
total = sizeof(struct host1x_job) + total = sizeof(struct host1x_job) +
(u64)num_relocs * sizeof(struct host1x_reloc) + (u64)num_relocs * sizeof(struct host1x_reloc) +
(u64)num_unpins * sizeof(struct host1x_job_unpin_data) + (u64)num_unpins * sizeof(struct host1x_job_unpin_data) +
(u64)num_cmdbufs * sizeof(struct host1x_job_gather) + (u64)num_cmdbufs * sizeof(struct host1x_job_cmd) +
(u64)num_unpins * sizeof(dma_addr_t) + (u64)num_unpins * sizeof(dma_addr_t) +
(u64)num_unpins * sizeof(u32 *); (u64)num_unpins * sizeof(u32 *);
if (total > ULONG_MAX) if (total > ULONG_MAX)
...@@ -48,6 +52,8 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch, ...@@ -48,6 +52,8 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
if (!job) if (!job)
return NULL; return NULL;
job->enable_firewall = enable_firewall;
kref_init(&job->ref); kref_init(&job->ref);
job->channel = ch; job->channel = ch;
...@@ -57,8 +63,8 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch, ...@@ -57,8 +63,8 @@ struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
mem += num_relocs * sizeof(struct host1x_reloc); mem += num_relocs * sizeof(struct host1x_reloc);
job->unpins = num_unpins ? mem : NULL; job->unpins = num_unpins ? mem : NULL;
mem += num_unpins * sizeof(struct host1x_job_unpin_data); mem += num_unpins * sizeof(struct host1x_job_unpin_data);
job->gathers = num_cmdbufs ? mem : NULL; job->cmds = num_cmdbufs ? mem : NULL;
mem += num_cmdbufs * sizeof(struct host1x_job_gather); mem += num_cmdbufs * sizeof(struct host1x_job_cmd);
job->addr_phys = num_unpins ? mem : NULL; job->addr_phys = num_unpins ? mem : NULL;
job->reloc_addr_phys = job->addr_phys; job->reloc_addr_phys = job->addr_phys;
...@@ -79,6 +85,13 @@ static void job_free(struct kref *ref) ...@@ -79,6 +85,13 @@ static void job_free(struct kref *ref)
{ {
struct host1x_job *job = container_of(ref, struct host1x_job, ref); struct host1x_job *job = container_of(ref, struct host1x_job, ref);
if (job->release)
job->release(job);
if (job->waiter)
host1x_intr_put_ref(job->syncpt->host, job->syncpt->id,
job->waiter, false);
if (job->syncpt) if (job->syncpt)
host1x_syncpt_put(job->syncpt); host1x_syncpt_put(job->syncpt);
...@@ -94,22 +107,38 @@ EXPORT_SYMBOL(host1x_job_put); ...@@ -94,22 +107,38 @@ EXPORT_SYMBOL(host1x_job_put);
void host1x_job_add_gather(struct host1x_job *job, struct host1x_bo *bo, void host1x_job_add_gather(struct host1x_job *job, struct host1x_bo *bo,
unsigned int words, unsigned int offset) unsigned int words, unsigned int offset)
{ {
struct host1x_job_gather *gather = &job->gathers[job->num_gathers]; struct host1x_job_gather *gather = &job->cmds[job->num_cmds].gather;
gather->words = words; gather->words = words;
gather->bo = bo; gather->bo = bo;
gather->offset = offset; gather->offset = offset;
job->num_gathers++; job->num_cmds++;
} }
EXPORT_SYMBOL(host1x_job_add_gather); EXPORT_SYMBOL(host1x_job_add_gather);
void host1x_job_add_wait(struct host1x_job *job, u32 id, u32 thresh,
bool relative, u32 next_class)
{
struct host1x_job_cmd *cmd = &job->cmds[job->num_cmds];
cmd->is_wait = true;
cmd->wait.id = id;
cmd->wait.threshold = thresh;
cmd->wait.next_class = next_class;
cmd->wait.relative = relative;
job->num_cmds++;
}
EXPORT_SYMBOL(host1x_job_add_wait);
static unsigned int pin_job(struct host1x *host, struct host1x_job *job) static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
{ {
struct host1x_client *client = job->client; struct host1x_client *client = job->client;
struct device *dev = client->dev; struct device *dev = client->dev;
struct host1x_job_gather *g; struct host1x_job_gather *g;
struct iommu_domain *domain; struct iommu_domain *domain;
struct sg_table *sgt;
unsigned int i; unsigned int i;
int err; int err;
...@@ -119,7 +148,6 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) ...@@ -119,7 +148,6 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
for (i = 0; i < job->num_relocs; i++) { for (i = 0; i < job->num_relocs; i++) {
struct host1x_reloc *reloc = &job->relocs[i]; struct host1x_reloc *reloc = &job->relocs[i];
dma_addr_t phys_addr, *phys; dma_addr_t phys_addr, *phys;
struct sg_table *sgt;
reloc->target.bo = host1x_bo_get(reloc->target.bo); reloc->target.bo = host1x_bo_get(reloc->target.bo);
if (!reloc->target.bo) { if (!reloc->target.bo) {
...@@ -192,20 +220,23 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) ...@@ -192,20 +220,23 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
* We will copy gathers BO content later, so there is no need to * We will copy gathers BO content later, so there is no need to
* hold and pin them. * hold and pin them.
*/ */
if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) if (job->enable_firewall)
return 0; return 0;
for (i = 0; i < job->num_gathers; i++) { for (i = 0; i < job->num_cmds; i++) {
size_t gather_size = 0; size_t gather_size = 0;
struct scatterlist *sg; struct scatterlist *sg;
struct sg_table *sgt;
dma_addr_t phys_addr; dma_addr_t phys_addr;
unsigned long shift; unsigned long shift;
struct iova *alloc; struct iova *alloc;
dma_addr_t *phys; dma_addr_t *phys;
unsigned int j; unsigned int j;
g = &job->gathers[i]; if (job->cmds[i].is_wait)
continue;
g = &job->cmds[i].gather;
g->bo = host1x_bo_get(g->bo); g->bo = host1x_bo_get(g->bo);
if (!g->bo) { if (!g->bo) {
err = -EINVAL; err = -EINVAL;
...@@ -296,7 +327,7 @@ static int do_relocs(struct host1x_job *job, struct host1x_job_gather *g) ...@@ -296,7 +327,7 @@ static int do_relocs(struct host1x_job *job, struct host1x_job_gather *g)
if (cmdbuf != reloc->cmdbuf.bo) if (cmdbuf != reloc->cmdbuf.bo)
continue; continue;
if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) { if (job->enable_firewall) {
target = (u32 *)job->gather_copy_mapped + target = (u32 *)job->gather_copy_mapped +
reloc->cmdbuf.offset / sizeof(u32) + reloc->cmdbuf.offset / sizeof(u32) +
g->offset / sizeof(u32); g->offset / sizeof(u32);
...@@ -538,8 +569,13 @@ static inline int copy_gathers(struct device *host, struct host1x_job *job, ...@@ -538,8 +569,13 @@ static inline int copy_gathers(struct device *host, struct host1x_job *job,
fw.num_relocs = job->num_relocs; fw.num_relocs = job->num_relocs;
fw.class = job->class; fw.class = job->class;
for (i = 0; i < job->num_gathers; i++) { for (i = 0; i < job->num_cmds; i++) {
struct host1x_job_gather *g = &job->gathers[i]; struct host1x_job_gather *g;
if (job->cmds[i].is_wait)
continue;
g = &job->cmds[i].gather;
size += g->words * sizeof(u32); size += g->words * sizeof(u32);
} }
...@@ -561,10 +597,14 @@ static inline int copy_gathers(struct device *host, struct host1x_job *job, ...@@ -561,10 +597,14 @@ static inline int copy_gathers(struct device *host, struct host1x_job *job,
job->gather_copy_size = size; job->gather_copy_size = size;
for (i = 0; i < job->num_gathers; i++) { for (i = 0; i < job->num_cmds; i++) {
struct host1x_job_gather *g = &job->gathers[i]; struct host1x_job_gather *g;
void *gather; void *gather;
if (job->cmds[i].is_wait)
continue;
g = &job->cmds[i].gather;
/* Copy the gather */ /* Copy the gather */
gather = host1x_bo_mmap(g->bo); gather = host1x_bo_mmap(g->bo);
memcpy(job->gather_copy_mapped + offset, gather + g->offset, memcpy(job->gather_copy_mapped + offset, gather + g->offset,
...@@ -600,28 +640,33 @@ int host1x_job_pin(struct host1x_job *job, struct device *dev) ...@@ -600,28 +640,33 @@ int host1x_job_pin(struct host1x_job *job, struct device *dev)
if (err) if (err)
goto out; goto out;
if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) { if (job->enable_firewall) {
err = copy_gathers(host->dev, job, dev); err = copy_gathers(host->dev, job, dev);
if (err) if (err)
goto out; goto out;
} }
/* patch gathers */ /* patch gathers */
for (i = 0; i < job->num_gathers; i++) { for (i = 0; i < job->num_cmds; i++) {
struct host1x_job_gather *g = &job->gathers[i]; struct host1x_job_gather *g;
if (job->cmds[i].is_wait)
continue;
g = &job->cmds[i].gather;
/* process each gather mem only once */ /* process each gather mem only once */
if (g->handled) if (g->handled)
continue; continue;
/* copy_gathers() sets gathers base if firewall is enabled */ /* copy_gathers() sets gathers base if firewall is enabled */
if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) if (!job->enable_firewall)
g->base = job->gather_addr_phys[i]; g->base = job->gather_addr_phys[i];
for (j = i + 1; j < job->num_gathers; j++) { for (j = i + 1; j < job->num_cmds; j++) {
if (job->gathers[j].bo == g->bo) { if (!job->cmds[j].is_wait &&
job->gathers[j].handled = true; job->cmds[j].gather.bo == g->bo) {
job->gathers[j].base = g->base; job->cmds[j].gather.handled = true;
job->cmds[j].gather.base = g->base;
} }
} }
...@@ -649,8 +694,7 @@ void host1x_job_unpin(struct host1x_job *job) ...@@ -649,8 +694,7 @@ void host1x_job_unpin(struct host1x_job *job)
struct device *dev = unpin->dev ?: host->dev; struct device *dev = unpin->dev ?: host->dev;
struct sg_table *sgt = unpin->sgt; struct sg_table *sgt = unpin->sgt;
if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) && if (!job->enable_firewall && unpin->size && host->domain) {
unpin->size && host->domain) {
iommu_unmap(host->domain, job->addr_phys[i], iommu_unmap(host->domain, job->addr_phys[i],
unpin->size); unpin->size);
free_iova(&host->iova, free_iova(&host->iova,
......
...@@ -18,6 +18,22 @@ struct host1x_job_gather { ...@@ -18,6 +18,22 @@ struct host1x_job_gather {
bool handled; bool handled;
}; };
struct host1x_job_wait {
u32 id;
u32 threshold;
u32 next_class;
bool relative;
};
struct host1x_job_cmd {
bool is_wait;
union {
struct host1x_job_gather gather;
struct host1x_job_wait wait;
};
};
struct host1x_job_unpin_data { struct host1x_job_unpin_data {
struct host1x_bo *bo; struct host1x_bo *bo;
struct sg_table *sgt; struct sg_table *sgt;
......
...@@ -407,6 +407,8 @@ static void syncpt_release(struct kref *ref) ...@@ -407,6 +407,8 @@ static void syncpt_release(struct kref *ref)
atomic_set(&sp->max_val, host1x_syncpt_read(sp)); atomic_set(&sp->max_val, host1x_syncpt_read(sp));
sp->locked = false;
mutex_lock(&sp->host->syncpt_mutex); mutex_lock(&sp->host->syncpt_mutex);
host1x_syncpt_base_free(sp->base); host1x_syncpt_base_free(sp->base);
......
...@@ -40,6 +40,13 @@ struct host1x_syncpt { ...@@ -40,6 +40,13 @@ struct host1x_syncpt {
/* interrupt data */ /* interrupt data */
struct host1x_syncpt_intr intr; struct host1x_syncpt_intr intr;
/*
* If a submission incrementing this syncpoint fails, lock it so that
* further submissions cannot be made until the application has handled
* the failure.
*/
bool locked;
}; };
/* Initialize sync point array */ /* Initialize sync point array */
...@@ -115,4 +122,9 @@ static inline int host1x_syncpt_is_valid(struct host1x_syncpt *sp) ...@@ -115,4 +122,9 @@ static inline int host1x_syncpt_is_valid(struct host1x_syncpt *sp)
return sp->id < host1x_syncpt_nb_pts(sp->host); return sp->id < host1x_syncpt_nb_pts(sp->host);
} }
static inline void host1x_syncpt_set_locked(struct host1x_syncpt *sp)
{
sp->locked = true;
}
#endif #endif
...@@ -170,6 +170,8 @@ u32 host1x_syncpt_base_id(struct host1x_syncpt_base *base); ...@@ -170,6 +170,8 @@ u32 host1x_syncpt_base_id(struct host1x_syncpt_base *base);
void host1x_syncpt_release_vblank_reservation(struct host1x_client *client, void host1x_syncpt_release_vblank_reservation(struct host1x_client *client,
u32 syncpt_id); u32 syncpt_id);
struct dma_fence *host1x_fence_create(struct host1x_syncpt *sp, u32 threshold);
/* /*
* host1x channel * host1x channel
*/ */
...@@ -216,8 +218,8 @@ struct host1x_job { ...@@ -216,8 +218,8 @@ struct host1x_job {
struct host1x_client *client; struct host1x_client *client;
/* Gathers and their memory */ /* Gathers and their memory */
struct host1x_job_gather *gathers; struct host1x_job_cmd *cmds;
unsigned int num_gathers; unsigned int num_cmds;
/* Array of handles to be pinned & unpinned */ /* Array of handles to be pinned & unpinned */
struct host1x_reloc *relocs; struct host1x_reloc *relocs;
...@@ -234,9 +236,15 @@ struct host1x_job { ...@@ -234,9 +236,15 @@ struct host1x_job {
u32 syncpt_incrs; u32 syncpt_incrs;
u32 syncpt_end; u32 syncpt_end;
/* Completion waiter ref */
void *waiter;
/* Maximum time to wait for this job */ /* Maximum time to wait for this job */
unsigned int timeout; unsigned int timeout;
/* Job has timed out and should be released */
bool cancelled;
/* Index and number of slots used in the push buffer */ /* Index and number of slots used in the push buffer */
unsigned int first_get; unsigned int first_get;
unsigned int num_slots; unsigned int num_slots;
...@@ -257,12 +265,25 @@ struct host1x_job { ...@@ -257,12 +265,25 @@ struct host1x_job {
/* Add a channel wait for previous ops to complete */ /* Add a channel wait for previous ops to complete */
bool serialize; bool serialize;
/* Fast-forward syncpoint increments on job timeout */
bool syncpt_recovery;
/* Callback called when job is freed */
void (*release)(struct host1x_job *job);
void *user_data;
/* Whether the host1x-side firewall should be run for this job or not */ 
bool enable_firewall;
}; };
struct host1x_job *host1x_job_alloc(struct host1x_channel *ch, struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
u32 num_cmdbufs, u32 num_relocs); u32 num_cmdbufs, u32 num_relocs,
bool skip_firewall);
void host1x_job_add_gather(struct host1x_job *job, struct host1x_bo *bo, void host1x_job_add_gather(struct host1x_job *job, struct host1x_bo *bo,
unsigned int words, unsigned int offset); unsigned int words, unsigned int offset);
void host1x_job_add_wait(struct host1x_job *job, u32 id, u32 thresh,
bool relative, u32 next_class);
struct host1x_job *host1x_job_get(struct host1x_job *job); struct host1x_job *host1x_job_get(struct host1x_job *job);
void host1x_job_put(struct host1x_job *job); void host1x_job_put(struct host1x_job *job);
int host1x_job_pin(struct host1x_job *job, struct device *dev); int host1x_job_pin(struct host1x_job *job, struct device *dev);
......
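As a hedged illustration of how the extended kernel-side API fits together, a client driver might now build a job roughly as follows; channel, syncpt, threshold, engine_class, bo, num_words and dev are placeholders, and most error handling is elided:

/* Hedged sketch of a client driver using the updated job API. */
struct host1x_job *job;
int err;

/* the new fourth argument is the per-job skip_firewall flag */
job = host1x_job_alloc(channel, 1 /* cmdbufs */, 0 /* relocs */, false);
if (!job)
	return -ENOMEM;

/* wait for a prior fence before running the gather */
host1x_job_add_wait(job, host1x_syncpt_id(syncpt), threshold,
		    true /* relative */, engine_class);

/* then execute the command buffer */
host1x_job_add_gather(job, bo, num_words, 0 /* offset */);

err = host1x_job_pin(job, dev);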
/* /* SPDX-License-Identifier: MIT */
* Copyright (c) 2012-2013, NVIDIA CORPORATION. All rights reserved. /* Copyright (c) 2012-2020 NVIDIA Corporation */
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UAPI_TEGRA_DRM_H_ #ifndef _UAPI_TEGRA_DRM_H_
#define _UAPI_TEGRA_DRM_H_ #define _UAPI_TEGRA_DRM_H_
...@@ -29,6 +10,8 @@ ...@@ -29,6 +10,8 @@
extern "C" { extern "C" {
#endif #endif
/* Tegra DRM legacy UAPI. Only enabled with STAGING */
#define DRM_TEGRA_GEM_CREATE_TILED (1 << 0) #define DRM_TEGRA_GEM_CREATE_TILED (1 << 0)
#define DRM_TEGRA_GEM_CREATE_BOTTOM_UP (1 << 1) #define DRM_TEGRA_GEM_CREATE_BOTTOM_UP (1 << 1)
...@@ -674,6 +657,402 @@ struct drm_tegra_gem_get_flags { ...@@ -674,6 +657,402 @@ struct drm_tegra_gem_get_flags {
#define DRM_IOCTL_TEGRA_GEM_SET_FLAGS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_GEM_SET_FLAGS, struct drm_tegra_gem_set_flags) #define DRM_IOCTL_TEGRA_GEM_SET_FLAGS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_GEM_SET_FLAGS, struct drm_tegra_gem_set_flags)
#define DRM_IOCTL_TEGRA_GEM_GET_FLAGS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_GEM_GET_FLAGS, struct drm_tegra_gem_get_flags) #define DRM_IOCTL_TEGRA_GEM_GET_FLAGS DRM_IOWR(DRM_COMMAND_BASE + DRM_TEGRA_GEM_GET_FLAGS, struct drm_tegra_gem_get_flags)
/* New Tegra DRM UAPI */
/*
* Reported by the driver in the `capabilities` field.
*
* DRM_TEGRA_CHANNEL_CAP_CACHE_COHERENT: If set, the engine is cache coherent
 * with respect to system memory.
*/
#define DRM_TEGRA_CHANNEL_CAP_CACHE_COHERENT (1 << 0)
struct drm_tegra_channel_open {
/**
* @host1x_class: [in]
*
* Host1x class of the engine that will be programmed using this
* channel.
*/
__u32 host1x_class;
/**
* @flags: [in]
*
* Flags.
*/
__u32 flags;
/**
* @context: [out]
*
* Opaque identifier corresponding to the opened channel.
*/
__u32 context;
/**
* @version: [out]
*
* Version of the engine hardware. This can be used by userspace
* to determine how the engine needs to be programmed.
*/
__u32 version;
/**
* @capabilities: [out]
*
* Flags describing the hardware capabilities.
*/
__u32 capabilities;
__u32 padding;
};
struct drm_tegra_channel_close {
/**
* @context: [in]
*
* Identifier of the channel to close.
*/
__u32 context;
__u32 padding;
};
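A hedged userspace sketch of the open/close pair: it opens a channel for the VIC engine (Host1x class 0x5d on recent Tegra SoCs, an assumption on our part) and closes it again. A real application would typically use libdrm's drmIoctl() rather than raw ioctl():

#include <string.h>
#include <sys/ioctl.h>
#include <drm/tegra_drm.h>

/* Sketch only: open a VIC channel and return its context and HW version. */
static int tegra_open_vic_channel(int fd, __u32 *context, __u32 *version)
{
	struct drm_tegra_channel_open args;

	memset(&args, 0, sizeof(args));
	args.host1x_class = 0x5d;	/* VIC, assumed class number */

	if (ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_OPEN, &args) < 0)
		return -1;

	*context = args.context;
	*version = args.version;
	return 0;
}

static void tegra_close_channel(int fd, __u32 context)
{
	struct drm_tegra_channel_close args = { .context = context };

	ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_CLOSE, &args);
}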
/*
* Mapping flags that can be used to influence how the mapping is created.
*
* DRM_TEGRA_CHANNEL_MAP_READ: create mapping that allows HW read access
* DRM_TEGRA_CHANNEL_MAP_WRITE: create mapping that allows HW write access
*/
#define DRM_TEGRA_CHANNEL_MAP_READ (1 << 0)
#define DRM_TEGRA_CHANNEL_MAP_WRITE (1 << 1)
#define DRM_TEGRA_CHANNEL_MAP_READ_WRITE (DRM_TEGRA_CHANNEL_MAP_READ | \
DRM_TEGRA_CHANNEL_MAP_WRITE)
struct drm_tegra_channel_map {
/**
* @context: [in]
*
	 * Identifier of the channel to make the memory available to.
*/
__u32 context;
/**
* @handle: [in]
*
* GEM handle of the memory to map.
*/
__u32 handle;
/**
* @flags: [in]
*
* Flags.
*/
__u32 flags;
/**
* @mapping: [out]
*
* Identifier corresponding to the mapping, to be used for
* relocations or unmapping later.
*/
__u32 mapping;
};
struct drm_tegra_channel_unmap {
/**
* @context: [in]
*
	 * Identifier of the channel to unmap memory from.
*/
__u32 context;
/**
* @mapping: [in]
*
* Mapping identifier of the memory mapping to unmap.
*/
__u32 mapping;
};
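A matching hedged sketch of the map/unmap pair: it exposes a GEM buffer to the engine behind "context" with read/write access and later undoes the mapping; error handling is minimal:

#include <sys/ioctl.h>
#include <drm/tegra_drm.h>

/* Sketch only: map a GEM handle into the channel and return the mapping ID. */
static int tegra_map_buffer(int fd, __u32 context, __u32 gem_handle,
			    __u32 *mapping)
{
	struct drm_tegra_channel_map args = {
		.context = context,
		.handle = gem_handle,
		.flags = DRM_TEGRA_CHANNEL_MAP_READ_WRITE,
	};

	if (ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_MAP, &args) < 0)
		return -1;

	*mapping = args.mapping;
	return 0;
}

static void tegra_unmap_buffer(int fd, __u32 context, __u32 mapping)
{
	struct drm_tegra_channel_unmap args = {
		.context = context,
		.mapping = mapping,
	};

	ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_UNMAP, &args);
}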
/* Submission */
/**
* Specify that bit 39 of the patched-in address should be set to switch
* swizzling between Tegra and non-Tegra sector layout on systems that store
* surfaces in system memory in non-Tegra sector layout.
*/
#define DRM_TEGRA_SUBMIT_RELOC_SECTOR_LAYOUT (1 << 0)
struct drm_tegra_submit_buf {
/**
* @mapping: [in]
*
* Identifier of the mapping to use in the submission.
*/
__u32 mapping;
/**
* @flags: [in]
*
* Flags.
*/
__u32 flags;
/**
* Information for relocation patching.
*/
struct {
/**
* @target_offset: [in]
*
* Offset from the start of the mapping of the data whose
* address is to be patched into the gather.
*/
__u64 target_offset;
/**
* @gather_offset_words: [in]
*
* Offset in words from the start of the gather data to
		 * where the address should be patched in.
*/
__u32 gather_offset_words;
/**
* @shift: [in]
*
* Number of bits the address should be shifted right before
* patching in.
*/
__u32 shift;
} reloc;
};
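Read together, the reloc fields imply the following patching arithmetic; this is a hedged, kernel-side illustration in which "mapping_iova" stands for the engine-visible base address of the mapping and is not part of the UAPI:

/* Hedged illustration of the relocation arithmetic implied by the
 * fields above; not the literal driver code.
 */
u64 address = mapping_iova + buf->reloc.target_offset;

if (buf->flags & DRM_TEGRA_SUBMIT_RELOC_SECTOR_LAYOUT)
	address |= 1ULL << 39;

gather_data[buf->reloc.gather_offset_words] =
	(u32)(address >> buf->reloc.shift);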
/**
* Execute `words` words of Host1x opcodes specified in the `gather_data_ptr`
* buffer. Each GATHER_UPTR command uses successive words from the buffer.
*/
#define DRM_TEGRA_SUBMIT_CMD_GATHER_UPTR 0
/**
* Wait for a syncpoint to reach a value before continuing with further
* commands.
*/
#define DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT 1
/**
* Wait for a syncpoint to reach a value before continuing with further
* commands. The threshold is calculated relative to the start of the job.
*/
#define DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT_RELATIVE 2
struct drm_tegra_submit_cmd_gather_uptr {
__u32 words;
__u32 reserved[3];
};
struct drm_tegra_submit_cmd_wait_syncpt {
__u32 id;
__u32 value;
__u32 reserved[2];
};
struct drm_tegra_submit_cmd {
/**
* @type: [in]
*
* Command type to execute. One of the DRM_TEGRA_SUBMIT_CMD*
* defines.
*/
__u32 type;
/**
* @flags: [in]
*
* Flags.
*/
__u32 flags;
union {
struct drm_tegra_submit_cmd_gather_uptr gather_uptr;
struct drm_tegra_submit_cmd_wait_syncpt wait_syncpt;
__u32 reserved[4];
};
};
struct drm_tegra_submit_syncpt {
/**
* @id: [in]
*
* ID of the syncpoint that the job will increment.
*/
__u32 id;
/**
* @flags: [in]
*
* Flags.
*/
__u32 flags;
/**
* @increments: [in]
*
* Number of times the job will increment this syncpoint.
*/
__u32 increments;
/**
* @value: [out]
*
* Value the syncpoint will have once the job has completed all
* its specified syncpoint increments.
*
* Note that the kernel may increment the syncpoint before or after
* the job. These increments are not reflected in this field.
*
* If the job hangs or times out, not all of the increments may
* get executed.
*/
__u32 value;
};
struct drm_tegra_channel_submit {
/**
* @context: [in]
*
* Identifier of the channel to submit this job to.
*/
__u32 context;
/**
* @num_bufs: [in]
*
* Number of elements in the `bufs_ptr` array.
*/
__u32 num_bufs;
/**
* @num_cmds: [in]
*
* Number of elements in the `cmds_ptr` array.
*/
__u32 num_cmds;
/**
* @gather_data_words: [in]
*
* Number of 32-bit words in the `gather_data_ptr` array.
*/
__u32 gather_data_words;
/**
* @bufs_ptr: [in]
*
* Pointer to an array of drm_tegra_submit_buf structures.
*/
__u64 bufs_ptr;
/**
* @cmds_ptr: [in]
*
* Pointer to an array of drm_tegra_submit_cmd structures.
*/
__u64 cmds_ptr;
/**
* @gather_data_ptr: [in]
*
* Pointer to an array of Host1x opcodes to be used by GATHER_UPTR
* commands.
*/
__u64 gather_data_ptr;
/**
* @syncobj_in: [in]
*
	 * Handle for DRM syncobj that will be waited on before submission.
* Ignored if zero.
*/
__u32 syncobj_in;
/**
* @syncobj_out: [in]
*
* Handle for DRM syncobj that will have its fence replaced with
* the job's completion fence. Ignored if zero.
*/
__u32 syncobj_out;
/**
	 * @syncpt: [in,out]
*
* Information about the syncpoint the job will increment.
*/
struct drm_tegra_submit_syncpt syncpt;
};
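Putting the submission structures together, a hedged userspace sketch: it submits "num_words" opcodes as a single GATHER_UPTR command, requests one syncpoint increment, and reports the threshold that marks job completion. The function name is hypothetical, and "context" and "syncpt_id" are assumed to come from earlier CHANNEL_OPEN and SYNCPOINT_ALLOCATE calls:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/tegra_drm.h>

/* Sketch only: one GATHER_UPTR command, one syncpoint increment. */
static int tegra_submit_gather(int fd, __u32 context, __u32 syncpt_id,
			       const __u32 *opcodes, __u32 num_words,
			       __u32 *threshold)
{
	struct drm_tegra_submit_cmd cmd;
	struct drm_tegra_channel_submit submit;

	memset(&cmd, 0, sizeof(cmd));
	cmd.type = DRM_TEGRA_SUBMIT_CMD_GATHER_UPTR;
	cmd.gather_uptr.words = num_words;

	memset(&submit, 0, sizeof(submit));
	submit.context = context;
	submit.num_cmds = 1;
	submit.cmds_ptr = (__u64)(uintptr_t)&cmd;
	submit.gather_data_words = num_words;
	submit.gather_data_ptr = (__u64)(uintptr_t)opcodes;
	submit.syncpt.id = syncpt_id;
	submit.syncpt.increments = 1;

	if (ioctl(fd, DRM_IOCTL_TEGRA_CHANNEL_SUBMIT, &submit) < 0)
		return -1;

	*threshold = submit.syncpt.value;
	return 0;
}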
struct drm_tegra_syncpoint_allocate {
/**
* @id: [out]
*
* ID of allocated syncpoint.
*/
__u32 id;
__u32 padding;
};
struct drm_tegra_syncpoint_free {
/**
* @id: [in]
*
* ID of syncpoint to free.
*/
__u32 id;
__u32 padding;
};
struct drm_tegra_syncpoint_wait {
/**
	 * @timeout_ns: [in]
*
* Absolute timestamp at which the wait will time out.
*/
__s64 timeout_ns;
/**
* @id: [in]
*
* ID of syncpoint to wait on.
*/
__u32 id;
/**
* @threshold: [in]
*
* Threshold to wait for.
*/
__u32 threshold;
/**
* @value: [out]
*
* Value of the syncpoint upon wait completion.
*/
__u32 value;
__u32 padding;
};
#define DRM_IOCTL_TEGRA_CHANNEL_OPEN DRM_IOWR(DRM_COMMAND_BASE + 0x10, struct drm_tegra_channel_open)
#define DRM_IOCTL_TEGRA_CHANNEL_CLOSE DRM_IOWR(DRM_COMMAND_BASE + 0x11, struct drm_tegra_channel_close)
#define DRM_IOCTL_TEGRA_CHANNEL_MAP DRM_IOWR(DRM_COMMAND_BASE + 0x12, struct drm_tegra_channel_map)
#define DRM_IOCTL_TEGRA_CHANNEL_UNMAP DRM_IOWR(DRM_COMMAND_BASE + 0x13, struct drm_tegra_channel_unmap)
#define DRM_IOCTL_TEGRA_CHANNEL_SUBMIT DRM_IOWR(DRM_COMMAND_BASE + 0x14, struct drm_tegra_channel_submit)
#define DRM_IOCTL_TEGRA_SYNCPOINT_ALLOCATE DRM_IOWR(DRM_COMMAND_BASE + 0x20, struct drm_tegra_syncpoint_allocate)
#define DRM_IOCTL_TEGRA_SYNCPOINT_FREE DRM_IOWR(DRM_COMMAND_BASE + 0x21, struct drm_tegra_syncpoint_free)
#define DRM_IOCTL_TEGRA_SYNCPOINT_WAIT DRM_IOWR(DRM_COMMAND_BASE + 0x22, struct drm_tegra_syncpoint_wait)
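Finally, a hedged sketch of the syncpoint ioctls: allocate a syncpoint, then wait for a threshold (for example the value reported by a previous submission) with roughly a one-second timeout. The assumption that timeout_ns is compared against CLOCK_MONOTONIC is ours, not something the header states:

#include <sys/ioctl.h>
#include <time.h>
#include <drm/tegra_drm.h>

/* Sketch only: allocate a job syncpoint and return its ID. */
static int tegra_alloc_syncpoint(int fd, __u32 *id)
{
	struct drm_tegra_syncpoint_allocate args = { 0 };

	if (ioctl(fd, DRM_IOCTL_TEGRA_SYNCPOINT_ALLOCATE, &args) < 0)
		return -1;

	*id = args.id;
	return 0;
}

/* Sketch only: wait up to ~1 s for the syncpoint to reach "threshold". */
static int tegra_wait_syncpoint(int fd, __u32 id, __u32 threshold)
{
	struct drm_tegra_syncpoint_wait wait = { 0 };
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	wait.timeout_ns = (__s64)ts.tv_sec * 1000000000LL + ts.tv_nsec +
			  1000000000LL;
	wait.id = id;
	wait.threshold = threshold;

	if (ioctl(fd, DRM_IOCTL_TEGRA_SYNCPOINT_WAIT, &wait) < 0)
		return -1;

	return 0;
}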
#if defined(__cplusplus) #if defined(__cplusplus)
} }
#endif #endif
......