[PATCH v2 1/2] drm/amdkfd: check access permisson to restore retry fault
Felix Kuehling
felix.kuehling at amd.com
Fri Aug 20 22:09:59 UTC 2021
Am 2021-08-20 um 2:31 p.m. schrieb Philip Yang:
> Check range access permission to restore GPU retry fault, if GPU retry
> fault on address which belongs to VMA, and VMA has no read or write
> permission requested by GPU, failed to restore the address. The vm fault
> event will pass back to user space.
>
> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 5 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +-
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 ++-
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 3 ++-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 29 +++++++++++++++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 5 +++--
> 6 files changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 831f00644460..bbb892d8f489 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -3347,12 +3347,13 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
> * @adev: amdgpu device pointer
> * @pasid: PASID of the VM
> * @addr: Address of the fault
> + * @write_fault: 0 is read fault, 1 is write fault
That should say true/false instead of 0/1. With that fixed, the series is
Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
> *
> * Try to gracefully handle a VM fault. Return true if the fault was handled and
> * shouldn't be reported any more.
> */
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> - uint64_t addr)
> + uint64_t addr, bool write_fault)
> {
> bool is_compute_context = false;
> struct amdgpu_bo *root;
> @@ -3377,7 +3378,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> addr /= AMDGPU_GPU_PAGE_SIZE;
>
> if (is_compute_context &&
> - !svm_range_restore_pages(adev, pasid, addr)) {
> + !svm_range_restore_pages(adev, pasid, addr, write_fault)) {
> amdgpu_bo_unref(&root);
> return true;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 80cc9ab2c1d0..85fcfb8c5efd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -448,7 +448,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
> void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
> struct amdgpu_task_info *task_info);
> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> - uint64_t addr);
> + uint64_t addr, bool write_fault);
>
> void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 24b781e90bef..41c3a0d70b7c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -93,6 +93,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
> struct amdgpu_iv_entry *entry)
> {
> bool retry_fault = !!(entry->src_data[1] & 0x80);
> + bool write_fault = !!(entry->src_data[1] & 0x20);
> struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
> struct amdgpu_task_info task_info;
> uint32_t status = 0;
> @@ -121,7 +122,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
> /* Try to handle the recoverable page faults by filling page
> * tables
> */
> - if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
> + if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, write_fault))
> return 1;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 097230b5e946..a7dfb370f642 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -506,6 +506,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> struct amdgpu_iv_entry *entry)
> {
> bool retry_fault = !!(entry->src_data[1] & 0x80);
> + bool write_fault = !!(entry->src_data[1] & 0x20);
> uint32_t status = 0, cid = 0, rw = 0;
> struct amdgpu_task_info task_info;
> struct amdgpu_vmhub *hub;
> @@ -536,7 +537,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
> /* Try to handle the recoverable page faults by filling page
> * tables
> */
> - if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
> + if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, write_fault))
> return 1;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index d4a43c94bcf9..b6d1268bc38f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2400,9 +2400,29 @@ svm_range_count_fault(struct amdgpu_device *adev, struct kfd_process *p,
> WRITE_ONCE(pdd->faults, pdd->faults + 1);
> }
>
> +static bool
> +svm_fault_allowed(struct mm_struct *mm, uint64_t addr, bool write_fault)
> +{
> + unsigned long requested = VM_READ;
> + struct vm_area_struct *vma;
> +
> + if (write_fault)
> + requested |= VM_WRITE;
> +
> + vma = find_vma(mm, addr << PAGE_SHIFT);
> + if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
> + pr_debug("address 0x%llx VMA is removed\n", addr);
> + return true;
> + }
> +
> + pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
> + vma->vm_flags);
> + return (vma->vm_flags & requested) == requested;
> +}
> +
> int
> svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> - uint64_t addr)
> + uint64_t addr, bool write_fault)
> {
> struct mm_struct *mm = NULL;
> struct svm_range_list *svms;
> @@ -2484,6 +2504,13 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> goto out_unlock_range;
> }
>
> + if (!svm_fault_allowed(mm, addr, write_fault)) {
> + pr_debug("fault addr 0x%llx no %s permission\n", addr,
> + write_fault ? "write" : "read");
> + r = -EPERM;
> + goto out_unlock_range;
> + }
> +
> best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
> if (best_loc == -1) {
> pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index e7fc5e8998aa..6dc91c33e80f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -175,7 +175,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
> unsigned long addr, struct svm_range *parent,
> struct svm_range *prange);
> int svm_range_restore_pages(struct amdgpu_device *adev,
> - unsigned int pasid, uint64_t addr);
> + unsigned int pasid, uint64_t addr, bool write_fault);
> int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
> void svm_range_add_list_work(struct svm_range_list *svms,
> struct svm_range *prange, struct mm_struct *mm,
> @@ -210,7 +210,8 @@ static inline void svm_range_list_fini(struct kfd_process *p)
> }
>
> static inline int svm_range_restore_pages(struct amdgpu_device *adev,
> - unsigned int pasid, uint64_t addr)
> + unsigned int pasid, uint64_t addr,
> + bool write_fault)
> {
> return -EFAULT;
> }
More information about the amd-gfx
mailing list