Commit 421a511a authored by Joao Martins's avatar Joao Martins Committed by Jason Gunthorpe

iommu/amd: Access/Dirty bit support in IOPTEs

IOMMU advertises Access/Dirty bits if the extended feature register reports
it. Relevant AMD IOMMU SDM ref[0] "1.3.8 Enhanced Support for Access and
Dirty Bits"

To enable it set the DTE flag in bits 7 and 8 to enable access, or
access+dirty. With that, the IOMMU starts marking the D and A flags on
every Memory Request or ATS translation request. It is on the VMM side to
steer whether to enable dirty tracking or not, rather than wrongly doing in
IOMMU. Relevant AMD IOMMU SDM ref [0], "Table 7. Device Table Entry (DTE)
Field Definitions" particularly the entry "HAD".

To actually toggle on and off it's relatively simple as it's setting 2 bits
on DTE and flush the device DTE cache.

To get what's dirtied use existing AMD io-pgtable support, by walking the
pagetables over each IOVA, with fetch_pte().  The IOTLB flushing is left to
the caller (much like unmap), and iommu_dirty_bitmap_record() is the one
adding page-ranges to invalidate. This allows caller to batch the flush
over a big span of IOVA space, without the iommu wondering about when to
flush.

Worthwhile sections from AMD IOMMU SDM:

"2.2.3.1 Host Access Support"
"2.2.3.2 Host Dirty Support"

For details on how IOMMU hardware updates the dirty bit see, and expects
from its consequent clearing by CPU:

"2.2.7.4 Updating Accessed and Dirty Bits in the Guest Address Tables"
"2.2.7.5 Clearing Accessed and Dirty Bits"

Quoting the SDM:

"The setting of accessed and dirty status bits in the page tables is
visible to both the CPU and the peripheral when sharing guest page tables.
The IOMMU interlocked operations to update A and D bits must be 64-bit
operations and naturally aligned on a 64-bit boundary"

.. and for the IOMMU update sequence to Dirty bit, essentially is states:

1. Decodes the read and write intent from the memory access.
2. If P=0 in the page descriptor, fail the access.
3. Compare the A & D bits in the descriptor with the read and write
intent in the request.
4. If the A or D bits need to be updated in the descriptor:
* Start atomic operation.
* Read the descriptor as a 64-bit access.
* If the descriptor no longer appears to require an update, release the
atomic lock with
no further action and continue to step 5.
* Calculate the new A & D bits.
* Write the descriptor as a 64-bit access.
* End atomic operation.
5. Continue to the next stage of translation or to the memory access.

Access/Dirty bits readout also need to consider the non-default page-sizes
(aka replicated PTEs as mentined by manual), as AMD supports all powers of
two (except 512G) page sizes.

Select IOMMUFD_DRIVER only if IOMMUFD is enabled considering that IOMMU
dirty tracking requires IOMMUFD.

Link: https://lore.kernel.org/r/20231024135109.73787-12-joao.m.martins@oracle.comSigned-off-by: default avatarJoao Martins <joao.m.martins@oracle.com>
Reviewed-by: default avatarSuravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: default avatarJason Gunthorpe <jgg@nvidia.com>
parent 13428815
...@@ -10,6 +10,7 @@ config AMD_IOMMU ...@@ -10,6 +10,7 @@ config AMD_IOMMU
select IOMMU_API select IOMMU_API
select IOMMU_IOVA select IOMMU_IOVA
select IOMMU_IO_PGTABLE select IOMMU_IO_PGTABLE
select IOMMUFD_DRIVER if IOMMUFD
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help help
With this option you can enable support for AMD IOMMU hardware in With this option you can enable support for AMD IOMMU hardware in
......
...@@ -97,7 +97,9 @@ ...@@ -97,7 +97,9 @@
#define FEATURE_GATS_MASK (3ULL) #define FEATURE_GATS_MASK (3ULL)
#define FEATURE_GAM_VAPIC BIT_ULL(21) #define FEATURE_GAM_VAPIC BIT_ULL(21)
#define FEATURE_GIOSUP BIT_ULL(48) #define FEATURE_GIOSUP BIT_ULL(48)
#define FEATURE_HASUP BIT_ULL(49)
#define FEATURE_EPHSUP BIT_ULL(50) #define FEATURE_EPHSUP BIT_ULL(50)
#define FEATURE_HDSUP BIT_ULL(52)
#define FEATURE_SNP BIT_ULL(63) #define FEATURE_SNP BIT_ULL(63)
#define FEATURE_PASID_SHIFT 32 #define FEATURE_PASID_SHIFT 32
...@@ -212,6 +214,7 @@ ...@@ -212,6 +214,7 @@
/* macros and definitions for device table entries */ /* macros and definitions for device table entries */
#define DEV_ENTRY_VALID 0x00 #define DEV_ENTRY_VALID 0x00
#define DEV_ENTRY_TRANSLATION 0x01 #define DEV_ENTRY_TRANSLATION 0x01
#define DEV_ENTRY_HAD 0x07
#define DEV_ENTRY_PPR 0x34 #define DEV_ENTRY_PPR 0x34
#define DEV_ENTRY_IR 0x3d #define DEV_ENTRY_IR 0x3d
#define DEV_ENTRY_IW 0x3e #define DEV_ENTRY_IW 0x3e
...@@ -370,10 +373,16 @@ ...@@ -370,10 +373,16 @@
#define PTE_LEVEL_PAGE_SIZE(level) \ #define PTE_LEVEL_PAGE_SIZE(level) \
(1ULL << (12 + (9 * (level)))) (1ULL << (12 + (9 * (level))))
/*
* The IOPTE dirty bit
*/
#define IOMMU_PTE_HD_BIT (6)
/* /*
* Bit value definition for I/O PTE fields * Bit value definition for I/O PTE fields
*/ */
#define IOMMU_PTE_PR BIT_ULL(0) #define IOMMU_PTE_PR BIT_ULL(0)
#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
#define IOMMU_PTE_U BIT_ULL(59) #define IOMMU_PTE_U BIT_ULL(59)
#define IOMMU_PTE_FC BIT_ULL(60) #define IOMMU_PTE_FC BIT_ULL(60)
#define IOMMU_PTE_IR BIT_ULL(61) #define IOMMU_PTE_IR BIT_ULL(61)
...@@ -384,6 +393,7 @@ ...@@ -384,6 +393,7 @@
*/ */
#define DTE_FLAG_V BIT_ULL(0) #define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1) #define DTE_FLAG_TV BIT_ULL(1)
#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55) #define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX_SHIFT (56) #define DTE_GLX_SHIFT (56)
...@@ -413,6 +423,7 @@ ...@@ -413,6 +423,7 @@
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
...@@ -563,6 +574,7 @@ struct protection_domain { ...@@ -563,6 +574,7 @@ struct protection_domain {
int nid; /* Node ID */ int nid; /* Node ID */
u64 *gcr3_tbl; /* Guest CR3 table */ u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */ unsigned long flags; /* flags to find out type of domain */
bool dirty_tracking; /* dirty tracking is enabled in the domain */
unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
}; };
......
...@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo ...@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
return (__pte & ~offset_mask) | (iova & offset_mask); return (__pte & ~offset_mask) | (iova & offset_mask);
} }
static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
unsigned long flags)
{
bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
bool dirty = false;
int i, count;
/*
* 2.2.3.2 Host Dirty Support
* When a non-default page size is used , software must OR the
* Dirty bits in all of the replicated host PTEs used to map
* the page. The IOMMU does not guarantee the Dirty bits are
* set in all of the replicated PTEs. Any portion of the page
* may have been written even if the Dirty bit is set in only
* one of the replicated PTEs.
*/
count = PAGE_SIZE_PTE_COUNT(size);
for (i = 0; i < count && test_only; i++) {
if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
dirty = true;
break;
}
}
for (i = 0; i < count && !test_only; i++) {
if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
(unsigned long *)&ptep[i])) {
dirty = true;
}
}
return dirty;
}
static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
unsigned long end = iova + size - 1;
do {
unsigned long pgsize = 0;
u64 *ptep, pte;
ptep = fetch_pte(pgtable, iova, &pgsize);
if (ptep)
pte = READ_ONCE(*ptep);
if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
iova += pgsize;
continue;
}
/*
* Mark the whole IOVA range as dirty even if only one of
* the replicated PTEs were marked dirty.
*/
if (pte_test_and_clear_dirty(ptep, pgsize, flags))
iommu_dirty_bitmap_record(dirty, iova, pgsize);
iova += pgsize;
} while (iova < end);
return 0;
}
/* /*
* ---------------------------------------------------- * ----------------------------------------------------
*/ */
...@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo ...@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
pgtable->iop.ops.map_pages = iommu_v1_map_pages; pgtable->iop.ops.map_pages = iommu_v1_map_pages;
pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
return &pgtable->iop; return &pgtable->iop;
} }
......
...@@ -66,6 +66,7 @@ LIST_HEAD(hpet_map); ...@@ -66,6 +66,7 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map); LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops; const struct iommu_ops amd_iommu_ops;
const struct iommu_dirty_ops amd_dirty_ops;
static ATOMIC_NOTIFIER_HEAD(ppr_notifier); static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1; int amd_iommu_max_glx_val = -1;
...@@ -1611,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, ...@@ -1611,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
pte_root |= 1ULL << DEV_ENTRY_PPR; pte_root |= 1ULL << DEV_ENTRY_PPR;
} }
if (domain->dirty_tracking)
pte_root |= DTE_FLAG_HAD;
if (domain->flags & PD_IOMMUV2_MASK) { if (domain->flags & PD_IOMMUV2_MASK) {
u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
u64 glx = domain->glx; u64 glx = domain->glx;
...@@ -2156,9 +2160,15 @@ static inline u64 dma_max_address(void) ...@@ -2156,9 +2160,15 @@ static inline u64 dma_max_address(void)
return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
} }
static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
return iommu && (iommu->features & FEATURE_HDSUP);
}
static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
struct device *dev, u32 flags) struct device *dev, u32 flags)
{ {
bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
struct protection_domain *domain; struct protection_domain *domain;
struct amd_iommu *iommu = NULL; struct amd_iommu *iommu = NULL;
...@@ -2175,6 +2185,9 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, ...@@ -2175,6 +2185,9 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
if (dirty_tracking && !amd_iommu_hd_support(iommu))
return ERR_PTR(-EOPNOTSUPP);
domain = protection_domain_alloc(type); domain = protection_domain_alloc(type);
if (!domain) if (!domain)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
...@@ -2187,6 +2200,9 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, ...@@ -2187,6 +2200,9 @@ static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
domain->domain.type = type; domain->domain.type = type;
domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
domain->domain.ops = iommu->iommu.ops->default_domain_ops; domain->domain.ops = iommu->iommu.ops->default_domain_ops;
if (dirty_tracking)
domain->domain.dirty_ops = &amd_dirty_ops;
} }
return &domain->domain; return &domain->domain;
...@@ -2208,7 +2224,7 @@ static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev, ...@@ -2208,7 +2224,7 @@ static struct iommu_domain *amd_iommu_domain_alloc_user(struct device *dev,
{ {
unsigned int type = IOMMU_DOMAIN_UNMANAGED; unsigned int type = IOMMU_DOMAIN_UNMANAGED;
if (flags) if (flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
return ERR_PTR(-EOPNOTSUPP); return ERR_PTR(-EOPNOTSUPP);
return do_iommu_domain_alloc(type, dev, flags); return do_iommu_domain_alloc(type, dev, flags);
...@@ -2251,6 +2267,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, ...@@ -2251,6 +2267,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
dev_data->defer_attach = false; dev_data->defer_attach = false;
/*
* Restrict to devices with compatible IOMMU hardware support
* when enforcement of dirty tracking is enabled.
*/
if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
return -EINVAL;
if (dev_data->domain) if (dev_data->domain)
detach_device(dev); detach_device(dev);
...@@ -2369,6 +2392,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) ...@@ -2369,6 +2392,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return true; return true;
case IOMMU_CAP_DEFERRED_FLUSH: case IOMMU_CAP_DEFERRED_FLUSH:
return true; return true;
case IOMMU_CAP_DIRTY_TRACKING: {
struct amd_iommu *iommu = rlookup_amd_iommu(dev);
return amd_iommu_hd_support(iommu);
}
default: default:
break; break;
} }
...@@ -2376,6 +2404,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) ...@@ -2376,6 +2404,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return false; return false;
} }
static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
bool enable)
{
struct protection_domain *pdomain = to_pdomain(domain);
struct dev_table_entry *dev_table;
struct iommu_dev_data *dev_data;
bool domain_flush = false;
struct amd_iommu *iommu;
unsigned long flags;
u64 pte_root;
spin_lock_irqsave(&pdomain->lock, flags);
if (!(pdomain->dirty_tracking ^ enable)) {
spin_unlock_irqrestore(&pdomain->lock, flags);
return 0;
}
list_for_each_entry(dev_data, &pdomain->dev_list, list) {
iommu = rlookup_amd_iommu(dev_data->dev);
if (!iommu)
continue;
dev_table = get_dev_table(iommu);
pte_root = dev_table[dev_data->devid].data[0];
pte_root = (enable ? pte_root | DTE_FLAG_HAD :
pte_root & ~DTE_FLAG_HAD);
/* Flush device DTE */
dev_table[dev_data->devid].data[0] = pte_root;
device_flush_dte(dev_data);
domain_flush = true;
}
/* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
if (domain_flush) {
amd_iommu_domain_flush_tlb_pde(pdomain);
amd_iommu_domain_flush_complete(pdomain);
}
pdomain->dirty_tracking = enable;
spin_unlock_irqrestore(&pdomain->lock, flags);
return 0;
}
static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
unsigned long iova, size_t size,
unsigned long flags,
struct iommu_dirty_bitmap *dirty)
{
struct protection_domain *pdomain = to_pdomain(domain);
struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
unsigned long lflags;
if (!ops || !ops->read_and_clear_dirty)
return -EOPNOTSUPP;
spin_lock_irqsave(&pdomain->lock, lflags);
if (!pdomain->dirty_tracking && dirty->bitmap) {
spin_unlock_irqrestore(&pdomain->lock, lflags);
return -EINVAL;
}
spin_unlock_irqrestore(&pdomain->lock, lflags);
return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
}
static void amd_iommu_get_resv_regions(struct device *dev, static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head) struct list_head *head)
{ {
...@@ -2498,6 +2593,11 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) ...@@ -2498,6 +2593,11 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true; return true;
} }
const struct iommu_dirty_ops amd_dirty_ops = {
.set_dirty_tracking = amd_iommu_set_dirty_tracking,
.read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
};
const struct iommu_ops amd_iommu_ops = { const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable, .capable = amd_iommu_capable,
.domain_alloc = amd_iommu_domain_alloc, .domain_alloc = amd_iommu_domain_alloc,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment