/*******************************************************************************
Copyright (c) 2016-2023 NVIDIA Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*******************************************************************************/
#include "uvm_hmm.h"
// Support for HMM ( https://docs.kernel.org/mm/hmm.html ):
#ifdef NVCPU_X86_64
static bool uvm_disable_hmm = false;
MODULE_PARM_DESC(uvm_disable_hmm,
"Force-disable HMM functionality in the UVM driver. "
"Default: false (HMM is enabled if possible). "
"However, even with uvm_disable_hmm=false, HMM will not be "
"enabled if is not supported in this driver build "
"configuration, or if ATS settings conflict with HMM.");
#else
// So far, we've only tested HMM on x86_64, so disable it by default everywhere
// else.
static bool uvm_disable_hmm = true;
MODULE_PARM_DESC(uvm_disable_hmm,
"Force-disable HMM functionality in the UVM driver. "
"Default: true (HMM is not enabled on this CPU architecture). "
"However, even with uvm_disable_hmm=false, HMM will not be "
"enabled if is not supported in this driver build "
"configuration, or if ATS settings conflict with HMM.");
#endif
module_param(uvm_disable_hmm, bool, 0444);
#if UVM_IS_CONFIG_HMM()
#include <linux/hmm.h>
#include <linux/rmap.h>
#include <linux/migrate.h>
#include <linux/userfaultfd_k.h>
#include <linux/memremap.h>
#include <linux/wait.h>
#include "uvm_common.h"
#include "uvm_gpu.h"
#include "uvm_pmm_gpu.h"
#include "uvm_hal_types.h"
#include "uvm_push.h"
#include "uvm_hal.h"
#include "uvm_va_block_types.h"
#include "uvm_va_space_mm.h"
#include "uvm_va_space.h"
#include "uvm_va_range.h"
#include "uvm_range_tree.h"
#include "uvm_pmm_sysmem.h"
#include "uvm_lock.h"
#include "uvm_api.h"
#include "uvm_va_policy.h"
#include "uvm_tools.h"
static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page);
typedef struct
{
uvm_processor_id_t processor_id;
uvm_processor_id_t new_residency;
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_service_block_context_t *service_context;
uvm_page_mask_t page_mask;
uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_gpu_fault_event_t;
typedef struct
{
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_va_block_context_t *va_block_context;
uvm_va_block_region_t region;
uvm_processor_id_t dest_id;
uvm_make_resident_cause_t cause;
uvm_page_mask_t page_mask;
uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_migrate_event_t;
typedef struct
{
uvm_processor_id_t processor_id;
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_service_block_context_t *service_context;
uvm_page_mask_t page_mask;
uvm_page_mask_t same_devmem_page_mask;
} uvm_hmm_devmem_fault_context_t;
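// HMM can only be used when the module parameter allows it, ATS is not
// enabled globally, and va_space_mm is enabled system-wide.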
bool uvm_hmm_is_enabled_system_wide(void)
{
return !uvm_disable_hmm && !g_uvm_global.ats.enabled && uvm_va_space_mm_enabled_system();
}
bool uvm_hmm_is_enabled(uvm_va_space_t *va_space)
{
return uvm_hmm_is_enabled_system_wide() &&
uvm_va_space_mm_enabled(va_space) &&
!(va_space->initialization_flags & UVM_INIT_FLAGS_DISABLE_HMM);
}
static uvm_va_block_t *hmm_va_block_from_node(uvm_range_tree_node_t *node)
{
if (!node)
return NULL;
return container_of(node, uvm_va_block_t, hmm.node);
}
// Copies the contents of the source device-private page to the
// destination CPU page. This will invalidate mappings, so cannot be
// called while holding any va_block locks.
static NV_STATUS uvm_hmm_copy_devmem_page(struct page *dst_page, struct page *src_page, uvm_tracker_t *tracker)
{
uvm_gpu_phys_address_t src_addr;
uvm_gpu_phys_address_t dst_addr;
uvm_gpu_chunk_t *gpu_chunk;
NvU64 dma_addr;
uvm_push_t push;
NV_STATUS status = NV_OK;
uvm_gpu_t *gpu;
// Holding a reference on the device-private page ensures the gpu
// is already retained. This is because when a GPU is unregistered
// all device-private pages are migrated back to the CPU and freed
// before releasing the GPU. Therefore if we could get a reference
// to the page the GPU must be retained.
UVM_ASSERT(is_device_private_page(src_page) && page_count(src_page));
gpu_chunk = uvm_pmm_devmem_page_to_chunk(src_page);
gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
status = uvm_mmu_chunk_map(gpu_chunk);
if (status != NV_OK)
return status;
status = uvm_gpu_map_cpu_pages(gpu->parent, dst_page, PAGE_SIZE, &dma_addr);
if (status != NV_OK)
goto out_unmap_gpu;
dst_addr = uvm_gpu_phys_address(UVM_APERTURE_SYS, dma_addr);
src_addr = uvm_gpu_phys_address(UVM_APERTURE_VID, gpu_chunk->address);
status = uvm_push_begin_acquire(gpu->channel_manager,
UVM_CHANNEL_TYPE_GPU_TO_CPU,
tracker,
&push,
"Copy for remote process fault");
if (status != NV_OK)
goto out_unmap_cpu;
gpu->parent->ce_hal->memcopy(&push,
uvm_gpu_address_copy(gpu, dst_addr),
uvm_gpu_address_copy(gpu, src_addr),
PAGE_SIZE);
uvm_push_end(&push);
status = uvm_tracker_add_push_safe(tracker, &push);
out_unmap_cpu:
uvm_gpu_unmap_cpu_pages(gpu->parent, dma_addr, PAGE_SIZE);
out_unmap_gpu:
uvm_mmu_chunk_unmap(gpu_chunk, NULL);
return status;
}
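// Evict a single device-private page back to system memory: isolate the
// source page with migrate_device_range(), allocate and populate a CPU page
// (zero-filling it if the copy fails), then finalize the migration. Returns
// NV_ERR_BUSY_RETRY if the page could not be migrated.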
static NV_STATUS uvm_hmm_pmm_gpu_evict_pfn(unsigned long pfn)
{
unsigned long src_pfn = 0;
unsigned long dst_pfn = 0;
struct page *dst_page;
NV_STATUS status = NV_OK;
int ret;
ret = migrate_device_range(&src_pfn, pfn, 1);
if (ret)
return errno_to_nv_status(ret);
if (src_pfn & MIGRATE_PFN_MIGRATE) {
uvm_tracker_t tracker = UVM_TRACKER_INIT();
dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
}
lock_page(dst_page);
if (WARN_ON(uvm_hmm_copy_devmem_page(dst_page, migrate_pfn_to_page(src_pfn), &tracker) != NV_OK))
memzero_page(dst_page, 0, PAGE_SIZE);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
migrate_device_pages(&src_pfn, &dst_pfn, 1);
uvm_tracker_wait_deinit(&tracker);
}
out:
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
status = NV_ERR_BUSY_RETRY;
return status;
}
void uvm_hmm_va_space_initialize(uvm_va_space_t *va_space)
{
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
if (!uvm_hmm_is_enabled(va_space))
return;
uvm_range_tree_init(&hmm_va_space->blocks);
uvm_mutex_init(&hmm_va_space->blocks_lock, UVM_LOCK_ORDER_LEAF);
return;
}
void uvm_hmm_va_space_destroy(uvm_va_space_t *va_space)
{
uvm_hmm_va_space_t *hmm_va_space = &va_space->hmm;
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
if (!uvm_hmm_is_enabled(va_space))
return;
uvm_assert_rwsem_locked_write(&va_space->lock);
// The blocks_lock is not needed when the va_space lock is held for write.
uvm_range_tree_for_each_safe(node, next, &hmm_va_space->blocks) {
va_block = hmm_va_block_from_node(node);
uvm_range_tree_remove(&hmm_va_space->blocks, node);
mmu_interval_notifier_remove(&va_block->hmm.notifier);
uvm_va_block_kill(va_block);
}
}
static void hmm_va_block_unregister_gpu(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
struct mm_struct *mm)
{
uvm_va_policy_node_t *node;
uvm_mutex_lock(&va_block->lock);
// Reset preferred location and accessed-by of policy nodes if needed.
uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
if (uvm_id_equal(node->policy.preferred_location, gpu->id))
node->policy.preferred_location = UVM_ID_INVALID;
uvm_processor_mask_clear(&node->policy.accessed_by, gpu->id);
}
// Migrate and free any remaining resident allocations on this GPU.
uvm_va_block_unregister_gpu_locked(va_block, gpu, mm);
uvm_mutex_unlock(&va_block->lock);
}
void uvm_hmm_unregister_gpu(uvm_va_space_t *va_space, uvm_gpu_t *gpu, struct mm_struct *mm)
{
uvm_range_tree_node_t *node;
uvm_va_block_t *va_block;
struct range range = gpu->pmm.devmem.pagemap.range;
unsigned long pfn;
bool retry;
if (!uvm_hmm_is_enabled(va_space))
return;
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
// There could be pages with page->zone_device_data pointing to the va_space
// which may be about to be freed. Migrate those back to the CPU so we don't
// fault on them. Normally infinite retries are bad, but we don't have any
// option here. Device-private pages can't be pinned so migration should
// eventually succeed. Even if we did eventually bail out of the loop we'd
// just stall in memunmap_pages() anyway.
do {
retry = false;
for (pfn = __phys_to_pfn(range.start); pfn <= __phys_to_pfn(range.end); pfn++) {
struct page *page = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(page));
// This check is racy because nothing stops the page being freed and
// even reused. That doesn't matter though - worst case the
// migration fails, we retry and find the va_space doesn't match.
if (page->zone_device_data == va_space)
if (uvm_hmm_pmm_gpu_evict_pfn(pfn) != NV_OK)
retry = true;
}
} while (retry);
uvm_range_tree_for_each(node, &va_space->hmm.blocks) {
va_block = hmm_va_block_from_node(node);
hmm_va_block_unregister_gpu(va_block, gpu, mm);
}
}
static void hmm_va_block_remove_gpu_va_space(uvm_va_block_t *va_block,
uvm_gpu_va_space_t *gpu_va_space,
uvm_va_block_context_t *va_block_context)
{
uvm_mutex_lock(&va_block->lock);
uvm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
uvm_mutex_unlock(&va_block->lock);
// TODO: Bug 3660922: Need to handle read duplication at some point.
// See range_remove_gpu_va_space_managed().
}
void uvm_hmm_remove_gpu_va_space(uvm_va_space_t *va_space,
uvm_gpu_va_space_t *gpu_va_space,
struct mm_struct *mm)
{
uvm_va_block_context_t *va_block_context;
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
if (!uvm_hmm_is_enabled(va_space))
return;
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
va_block_context = uvm_va_space_block_context(va_space, mm);
uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
va_block = hmm_va_block_from_node(node);
hmm_va_block_remove_gpu_va_space(va_block, gpu_va_space, va_block_context);
}
}
static bool hmm_invalidate(uvm_va_block_t *va_block,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
uvm_thread_context_t *uvm_context = uvm_thread_context();
struct mmu_interval_notifier *mni = &va_block->hmm.notifier;
struct mm_struct *mm = mni->mm;
uvm_va_block_context_t *va_block_context;
uvm_va_block_region_t region;
NvU64 start, end;
uvm_processor_id_t id;
NV_STATUS status = NV_OK;
// The MMU_NOTIFY_RELEASE event isn't really needed since mn_itree_release()
// doesn't remove the interval notifiers from the struct_mm so there will
// be a full range MMU_NOTIFY_UNMAP event after the release from
// unmap_vmas() during exit_mmap().
if (range->event == MMU_NOTIFY_SOFT_DIRTY || range->event == MMU_NOTIFY_RELEASE)
return true;
// Blockable is only set false by
// mmu_notifier_invalidate_range_start_nonblock() which is only called in
// __oom_reap_task_mm().
if (!mmu_notifier_range_blockable(range))
return false;
// We only ignore invalidations in this context whilst holding the
// va_block lock. This prevents deadlock when try_to_migrate()
// calls the notifier, but holding the lock prevents other threads
// invalidating PTEs so we can safely assume the results of
// migrate_vma_setup() are correct.
if (uvm_context->ignore_hmm_invalidate_va_block == va_block ||
((range->event == MMU_NOTIFY_MIGRATE || range->event == MMU_NOTIFY_EXCLUSIVE) &&
range->owner == &g_uvm_global))
return true;
va_block_context = uvm_va_block_context_alloc(mm);
if (!va_block_context)
return true;
uvm_mutex_lock(&va_block->lock);
// mmu_interval_notifier_remove() is always called before marking a
// va_block as dead so this va_block has to be alive.
UVM_ASSERT(!uvm_va_block_is_dead(va_block));
// Note: unmap_vmas() does MMU_NOTIFY_UNMAP [0, 0xffffffffffffffff]
// Also note that hmm_invalidate() can be called when a new va_block is not
// yet inserted into the va_space->hmm.blocks table while the original
// va_block is being split. The original va_block may have its end address
// updated before the mmu interval notifier is updated so this invalidate
// may be for a range past the va_block end address.
start = range->start;
end = (range->end == ULONG_MAX) ? range->end : range->end - 1;
if (start < va_block->start)
start = va_block->start;
if (end > va_block->end)
end = va_block->end;
if (start > end)
goto unlock;
// These will be equal if no other thread causes an invalidation
// whilst the va_block lock was dropped.
uvm_context->hmm_invalidate_seqnum++;
va_block->hmm.changed++;
mmu_interval_set_seq(mni, cur_seq);
region = uvm_va_block_region_from_start_end(va_block, start, end);
va_block_context->hmm.vma = NULL;
// We only need to unmap GPUs since Linux handles the CPUs.
for_each_gpu_id_in_mask(id, &va_block->mapped) {
status = uvm_va_block_unmap(va_block,
va_block_context,
id,
region,
uvm_va_block_map_mask_get(va_block, id),
&va_block->tracker);
// Note that the va_block lock can be dropped, relocked, and
// NV_ERR_MORE_PROCESSING_REQUIRED returned.
if (status != NV_OK)
break;
}
if (range->event == MMU_NOTIFY_UNMAP || range->event == MMU_NOTIFY_CLEAR)
uvm_va_block_munmap_region(va_block, region);
if (status == NV_OK)
status = uvm_tracker_wait(&va_block->tracker);
// Remove stale HMM struct page pointers to system memory.
uvm_va_block_remove_cpu_chunks(va_block, region);
unlock:
uvm_mutex_unlock(&va_block->lock);
uvm_va_block_context_free(va_block_context);
UVM_ASSERT(status == NV_OK);
return true;
}
static bool uvm_hmm_invalidate_entry(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
uvm_va_block_t *va_block = container_of(mni, uvm_va_block_t, hmm.notifier);
UVM_ENTRY_RET(hmm_invalidate(va_block, range, cur_seq));
}
static const struct mmu_interval_notifier_ops uvm_hmm_notifier_ops =
{
.invalidate = uvm_hmm_invalidate_entry,
};
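// Look up the HMM va_block spanning the given address, if any, while holding
// the per-va_space HMM blocks_lock.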
NV_STATUS uvm_hmm_va_block_find(uvm_va_space_t *va_space,
NvU64 addr,
uvm_va_block_t **va_block_ptr)
{
uvm_range_tree_node_t *node;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
uvm_assert_rwsem_locked(&va_space->lock);
uvm_mutex_lock(&va_space->hmm.blocks_lock);
node = uvm_range_tree_find(&va_space->hmm.blocks, addr);
uvm_mutex_unlock(&va_space->hmm.blocks_lock);
if (!node)
return NV_ERR_OBJECT_NOT_FOUND;
*va_block_ptr = hmm_va_block_from_node(node);
return NV_OK;
}
static int migrate_vma_setup_locked(struct migrate_vma *args, uvm_va_block_t *va_block)
{
uvm_thread_context_t *uvm_context = uvm_thread_context();
int ret;
// It's only safe to ignore invalidations whilst doing a migration
// and holding the va_block lock.
uvm_assert_mutex_locked(&va_block->lock);
uvm_context->ignore_hmm_invalidate_va_block = va_block;
ret = migrate_vma_setup(args);
// We shouldn't be generating any more invalidations now.
uvm_context->ignore_hmm_invalidate_va_block = NULL;
return ret;
}
static bool uvm_hmm_vma_is_valid(struct vm_area_struct *vma,
unsigned long addr,
bool allow_unreadable_vma)
{
// UVM doesn't support userfaultfd. hmm_range_fault() doesn't support
// VM_IO or VM_PFNMAP VMAs. It also doesn't support VMAs without VM_READ
// but we allow those VMAs to have policy set on them.
// migrate_vma_setup() doesn't support VM_SPECIAL VMAs but that is handled
// by uvm_hmm_must_use_sysmem() forcing residency to the CPU.
return vma &&
addr >= vma->vm_start &&
!userfaultfd_armed(vma) &&
!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
!uvm_vma_is_managed(vma) &&
(allow_unreadable_vma || (vma->vm_flags & VM_READ));
}
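// Initialize the HMM-specific fields of a newly created va_block.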
static void hmm_va_block_init(uvm_va_block_t *va_block,
uvm_va_space_t *va_space,
NvU64 start,
NvU64 end)
{
va_block->hmm.va_space = va_space;
va_block->hmm.node.start = start;
va_block->hmm.node.end = end;
uvm_range_tree_init(&va_block->hmm.va_policy_tree);
uvm_mutex_init(&va_block->hmm.migrate_lock, UVM_LOCK_ORDER_VA_BLOCK_MIGRATE);
}
static NV_STATUS hmm_va_block_find_create(uvm_va_space_t *va_space,
NvU64 addr,
bool allow_unreadable_vma,
struct vm_area_struct **vma_out,
uvm_va_block_t **va_block_ptr)
{
struct mm_struct *mm;
struct vm_area_struct *va_block_vma;
uvm_va_block_t *va_block;
NvU64 start, end;
NV_STATUS status;
int ret;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
mm = va_space->va_space_mm.mm;
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked(&va_space->lock);
UVM_ASSERT(PAGE_ALIGNED(addr));
// Note that we have to allow PROT_NONE VMAs so that policies can be set.
va_block_vma = find_vma(mm, addr);
if (!uvm_hmm_vma_is_valid(va_block_vma, addr, allow_unreadable_vma))
return NV_ERR_INVALID_ADDRESS;
// Since we only hold the va_space read lock, there can be multiple
// parallel va_block insertions.
uvm_mutex_lock(&va_space->hmm.blocks_lock);
va_block = hmm_va_block_from_node(uvm_range_tree_find(&va_space->hmm.blocks, addr));
if (va_block)
goto done;
// The va_block is always created to cover the whole aligned
// UVM_VA_BLOCK_SIZE interval unless there are existing UVM va_ranges or
// HMM va_blocks. In that case, the new HMM va_block size is adjusted so it
// doesn't overlap.
start = UVM_VA_BLOCK_ALIGN_DOWN(addr);
end = start + UVM_VA_BLOCK_SIZE - 1;
// Search for existing UVM va_ranges in the start/end interval and create
// a maximum interval that doesn't overlap any existing UVM va_ranges.
// We know that 'addr' is not within a va_range or
// hmm_va_block_find_create() wouldn't be called.
status = uvm_range_tree_find_hole_in(&va_space->va_range_tree, addr, &start, &end);
UVM_ASSERT(status == NV_OK);
// Search for existing HMM va_blocks in the start/end interval and create
// a maximum interval that doesn't overlap any existing HMM va_blocks.
status = uvm_range_tree_find_hole_in(&va_space->hmm.blocks, addr, &start, &end);
UVM_ASSERT(status == NV_OK);
// Create a HMM va_block with a NULL va_range pointer.
status = uvm_va_block_create(NULL, start, end, &va_block);
if (status != NV_OK)
goto err_unlock;
hmm_va_block_init(va_block, va_space, start, end);
ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
mm,
start,
end - start + 1,
&uvm_hmm_notifier_ops);
if (ret) {
status = errno_to_nv_status(ret);
goto err_release;
}
status = uvm_range_tree_add(&va_space->hmm.blocks, &va_block->hmm.node);
UVM_ASSERT(status == NV_OK);
done:
uvm_mutex_unlock(&va_space->hmm.blocks_lock);
if (vma_out)
*vma_out = va_block_vma;
*va_block_ptr = va_block;
return NV_OK;
err_release:
uvm_va_block_release(va_block);
err_unlock:
uvm_mutex_unlock(&va_space->hmm.blocks_lock);
return status;
}
NV_STATUS uvm_hmm_va_block_find_create(uvm_va_space_t *va_space,
NvU64 addr,
struct vm_area_struct **vma,
uvm_va_block_t **va_block_ptr)
{
return hmm_va_block_find_create(va_space, addr, false, vma, va_block_ptr);
}
NV_STATUS uvm_hmm_find_vma(struct mm_struct *mm, struct vm_area_struct **vma_out, NvU64 addr)
{
if (!mm)
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(mm);
*vma_out = find_vma(mm, addr);
if (!uvm_hmm_vma_is_valid(*vma_out, addr, false))
return NV_ERR_INVALID_ADDRESS;
return NV_OK;
}
bool uvm_hmm_check_context_vma_is_valid(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
uvm_va_block_region_t region)
{
uvm_assert_mutex_locked(&va_block->lock);
if (uvm_va_block_is_hmm(va_block)) {
UVM_ASSERT(vma);
UVM_ASSERT(va_block->hmm.va_space->va_space_mm.mm == vma->vm_mm);
uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm);
UVM_ASSERT(vma->vm_start <= uvm_va_block_region_start(va_block, region));
UVM_ASSERT(vma->vm_end > uvm_va_block_region_end(va_block, region));
}
return true;
}
void uvm_hmm_service_context_init(uvm_service_block_context_t *service_context)
{
// TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
service_context->block_context->hmm.swap_cached = false;
}
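// Serialize migrations on a per-va_block basis. uvm_hmm_migrate_begin() is a
// non-blocking trylock that returns NV_ERR_BUSY_RETRY if another migration of
// this va_block is already in progress.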
NV_STATUS uvm_hmm_migrate_begin(uvm_va_block_t *va_block)
{
if (uvm_mutex_trylock(&va_block->hmm.migrate_lock))
return NV_OK;
return NV_ERR_BUSY_RETRY;
}
void uvm_hmm_migrate_begin_wait(uvm_va_block_t *va_block)
{
uvm_mutex_lock(&va_block->hmm.migrate_lock);
}
void uvm_hmm_migrate_finish(uvm_va_block_t *va_block)
{
uvm_mutex_unlock(&va_block->hmm.migrate_lock);
}
// Migrate the given range [start end] within a va_block to dest_id.
static NV_STATUS hmm_migrate_range(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t dest_id,
NvU64 start,
NvU64 end,
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker)
{
uvm_va_block_region_t region;
uvm_va_policy_node_t *node;
const uvm_va_policy_t *policy;
NV_STATUS status = NV_OK;
uvm_hmm_migrate_begin_wait(va_block);
uvm_mutex_lock(&va_block->lock);
uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
// Even though UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock the
// va_block lock, the policy remains valid because we hold the mmap
// lock so munmap can't remove the policy, and the va_space lock so the
// policy APIs can't change the policy.
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
va_block_retry,
uvm_va_block_migrate_locked(va_block,
va_block_retry,
va_block_context,
region,
dest_id,
mode,
out_tracker));
if (status != NV_OK)
break;
}
uvm_mutex_unlock(&va_block->lock);
uvm_hmm_migrate_finish(va_block);
return status;
}
NV_STATUS uvm_hmm_test_va_block_inject_split_error(uvm_va_space_t *va_space, NvU64 addr)
{
uvm_va_block_test_t *block_test;
uvm_va_block_t *va_block;
NV_STATUS status;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
status = hmm_va_block_find_create(va_space, addr, false, NULL, &va_block);
if (status != NV_OK)
return status;
block_test = uvm_va_block_get_test(va_block);
if (block_test)
block_test->inject_split_error = true;
return NV_OK;
}
typedef struct {
struct mmu_interval_notifier notifier;
uvm_va_block_t *existing_block;
} hmm_split_invalidate_data_t;
static bool hmm_split_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
hmm_split_invalidate_data_t *split_data = container_of(mni, hmm_split_invalidate_data_t, notifier);
uvm_tools_test_hmm_split_invalidate(split_data->existing_block->hmm.va_space);
hmm_invalidate(split_data->existing_block, range, cur_seq);
return true;
}
static bool hmm_split_invalidate_entry(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq)
{
UVM_ENTRY_RET(hmm_split_invalidate(mni, range, cur_seq));
}
static const struct mmu_interval_notifier_ops hmm_notifier_split_ops =
{
.invalidate = hmm_split_invalidate_entry,
};
// Splits existing va_block into two pieces, with new_va_block always after
// va_block. va_block is updated to have new_end. new_end+1 must be page-
// aligned.
//
// Before: [----------- existing ------------]
// After: [---- existing ----][---- new ----]
// ^new_end
//
// On error, va_block is still accessible and is left in its original
// functional state.
static NV_STATUS hmm_split_block(uvm_va_block_t *va_block,
NvU64 new_end,
uvm_va_block_t **new_block_ptr)
{
uvm_va_space_t *va_space = va_block->hmm.va_space;
struct mm_struct *mm = va_space->va_space_mm.mm;
hmm_split_invalidate_data_t split_data;
NvU64 delay_us;
uvm_va_block_t *new_va_block;
NV_STATUS status;
int ret;
uvm_assert_rwsem_locked_write(&va_space->lock);
UVM_ASSERT(new_end > va_block->start);
UVM_ASSERT(new_end < va_block->end);
UVM_ASSERT(PAGE_ALIGNED(new_end + 1));
status = uvm_va_block_create(NULL, new_end + 1, va_block->end, &new_va_block);
if (status != NV_OK)
return status;
// Initialize the newly created HMM va_block.
hmm_va_block_init(new_va_block, va_space, new_va_block->start, new_va_block->end);
ret = mmu_interval_notifier_insert(&new_va_block->hmm.notifier,
mm,
new_va_block->start,
uvm_va_block_size(new_va_block),
&uvm_hmm_notifier_ops);
// Since __mmu_notifier_register() was called when the va_space was
// initially created, we know that mm->notifier_subscriptions is valid
// and mmu_interval_notifier_insert() can't return ENOMEM.
// The only error return is for start + length overflowing but we already
// registered the same address range before so there should be no error.
UVM_ASSERT(!ret);
uvm_mutex_lock(&va_block->lock);
status = uvm_va_block_split_locked(va_block, new_end, new_va_block, NULL);
if (status != NV_OK)
goto err;
uvm_mutex_unlock(&va_block->lock);
// The MMU interval notifier has to be removed in order to resize it.
// That means there would be a window of time when invalidation callbacks
// could be missed. To handle this case, we register a temporary notifier
// to cover the address range while resizing the old notifier (it is
// OK to have multiple notifiers for the same range, we may simply try to
// invalidate twice).
split_data.existing_block = va_block;
ret = mmu_interval_notifier_insert(&split_data.notifier,
mm,
va_block->start,
new_end - va_block->start + 1,
&hmm_notifier_split_ops);
UVM_ASSERT(!ret);
// Delay to allow hmm_sanity test to trigger an mmu_notifier during the
// critical window where the split invalidate callback is active.
delay_us = atomic64_read(&va_space->test.split_invalidate_delay_us);
if (delay_us)
udelay(delay_us);
mmu_interval_notifier_remove(&va_block->hmm.notifier);
// Enable notifications on the old block with the smaller size.
ret = mmu_interval_notifier_insert(&va_block->hmm.notifier,
mm,
va_block->start,
uvm_va_block_size(va_block),
&uvm_hmm_notifier_ops);
UVM_ASSERT(!ret);
mmu_interval_notifier_remove(&split_data.notifier);
if (new_block_ptr)
*new_block_ptr = new_va_block;
return status;
err:
uvm_mutex_unlock(&va_block->lock);
mmu_interval_notifier_remove(&new_va_block->hmm.notifier);
uvm_va_block_release(new_va_block);
return status;
}
// Check to see if the HMM va_block would overlap the range start/end and
// split it so it can be removed. That breaks down to the following cases:
// start/end could cover all of the HMM va_block ->
// remove the va_block
// start/end could cover the left part of the HMM va_block ->
// remove the left part
// start/end could cover the right part of the HMM va_block ->
// remove the right part
// or start/end could "punch a hole" in the middle and leave the ends intact.
// In each case, only one HMM va_block is removed so return it in out_va_block.
static NV_STATUS split_block_if_needed(uvm_va_block_t *va_block,
NvU64 start,
NvU64 end,
uvm_va_block_t **out_va_block)
{
uvm_va_block_context_t *va_block_context;
uvm_va_space_t *va_space;
struct mm_struct *mm;
struct vm_area_struct *vma;
uvm_va_block_region_t region;
NvU64 addr, from, to;
uvm_va_block_t *new;
NV_STATUS status;
if (va_block->start < start) {
status = hmm_split_block(va_block, start - 1, &new);
if (status != NV_OK)
return status;
// Keep the left part, the right part will be deleted.
va_block = new;
}
if (va_block->end > end) {
status = hmm_split_block(va_block, end, NULL);
if (status != NV_OK)
return status;
// Keep the right part, the left part will be deleted.
}
*out_va_block = va_block;
// Migrate any GPU data to sysmem before destroying the HMM va_block.
// We do this because the new va_range might be for a UVM external
// allocation which could be converting an address range that was first
// operated on by UVM-HMM and the external allocation should see that data.
va_space = va_block->hmm.va_space;
mm = va_space->va_space_mm.mm;
va_block_context = uvm_va_space_block_context(va_space, mm);
for (addr = va_block->start; addr < va_block->end; addr = to + 1) {
vma = find_vma_intersection(mm, addr, va_block->end);
if (!vma)
break;
from = max(addr, (NvU64)vma->vm_start);
to = min(va_block->end, (NvU64)vma->vm_end - 1);
region = uvm_va_block_region_from_start_end(va_block, from, to);
if (!uvm_hmm_vma_is_valid(vma, from, false))
continue;
va_block_context->hmm.vma = vma;
status = hmm_migrate_range(va_block,
NULL,
va_block_context,
UVM_ID_CPU,
from,
to,
UVM_MIGRATE_MODE_MAKE_RESIDENT_AND_MAP,
NULL);
if (status != NV_OK)
return status;
}
return NV_OK;
}
// Normally, the HMM va_block is destroyed when the va_space is destroyed
// (i.e., when the /dev/nvidia-uvm device is closed). A munmap() call triggers
// a uvm_hmm_invalidate() callback which unmaps the VMA's range from the GPU's
// page tables. However, it doesn't destroy the va_block because that would
// require calling mmu_interval_notifier_remove() which can't be called from
// the invalidate callback due to Linux locking constraints. If a process
// calls mmap()/munmap() for SAM and then creates a managed allocation,
// the same VMA range can be picked and there would be a UVM/HMM va_block
// conflict. Creating a managed allocation, external allocation, or other
// va_range types, calls this function to remove stale HMM va_blocks or split
// the HMM va_block so there is no overlap.
NV_STATUS uvm_hmm_va_block_reclaim(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 start,
NvU64 end)
{
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
NV_STATUS status;
if (!uvm_hmm_is_enabled(va_space))
return NV_OK;
if (mm)
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
// Process each HMM va_block that overlaps the interval [start, end].
// Note that end is inclusive.
// The blocks_lock is not needed when the va_space lock is held for write.
uvm_range_tree_for_each_in_safe(node, next, &va_space->hmm.blocks, start, end) {
va_block = hmm_va_block_from_node(node);
if (mm) {
status = split_block_if_needed(va_block, start, end, &va_block);
if (status != NV_OK)
return status;
}
// Note that this waits for any invalidation callbacks to complete
// so uvm_hmm_invalidate() won't see a block disappear.
// The va_space write lock should prevent uvm_hmm_va_block_find_create()
// from adding it back.
mmu_interval_notifier_remove(&va_block->hmm.notifier);
uvm_range_tree_remove(&va_space->hmm.blocks, &va_block->hmm.node);
uvm_va_block_kill(va_block);
}
UVM_ASSERT(!uvm_range_tree_iter_first(&va_space->hmm.blocks, start, end));
return NV_OK;
}
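// Update the va_space HMM blocks tree after an existing va_block has been
// split so that the new block is tracked as a separate node.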
void uvm_hmm_va_block_split_tree(uvm_va_block_t *existing_va_block, uvm_va_block_t *new_block)
{
uvm_va_space_t *va_space = existing_va_block->hmm.va_space;
UVM_ASSERT(uvm_va_block_is_hmm(existing_va_block));
uvm_assert_rwsem_locked_write(&va_space->lock);
uvm_range_tree_split(&existing_va_block->hmm.va_space->hmm.blocks,
&existing_va_block->hmm.node,
&new_block->hmm.node);
}
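// Split the policy node spanning addr if the callback reports that a split is
// needed, so that subsequent policy changes apply on exact boundaries.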
NV_STATUS uvm_hmm_split_as_needed(uvm_va_space_t *va_space,
NvU64 addr,
uvm_va_policy_is_split_needed_t split_needed_cb,
void *data)
{
uvm_va_block_t *va_block;
uvm_va_policy_node_t *node;
NV_STATUS status;
uvm_assert_rwsem_locked_write(&va_space->lock);
// If there is no HMM va_block or the va_block doesn't span the policy
// addr, there is no need to split.
status = uvm_hmm_va_block_find(va_space, addr, &va_block);
if (status != NV_OK || va_block->start == addr)
return NV_OK;
uvm_mutex_lock(&va_block->lock);
node = uvm_va_policy_node_find(va_block, addr);
if (!node)
goto done;
// If the policy range doesn't span addr, we're done.
if (addr == node->node.start)
goto done;
if (split_needed_cb(&node->policy, data))
status = uvm_va_policy_node_split(va_block, node, addr - 1, NULL);
done:
uvm_mutex_unlock(&va_block->lock);
return status;
}
static NV_STATUS hmm_set_preferred_location_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t preferred_location,
NvU64 addr,
NvU64 end,
uvm_tracker_t *out_tracker)
{
uvm_processor_mask_t set_accessed_by_processors;
const uvm_va_policy_t *old_policy;
uvm_va_policy_node_t *node;
uvm_va_block_region_t region;
uvm_processor_id_t id;
NV_STATUS status, tracker_status;
// Note that we can't just call uvm_va_policy_set_range() for the whole
// range [addr end] because we need to examine the old value of
// policy->preferred_location before setting it. Thus we iterate over
// the existing policy nodes.
uvm_for_each_va_policy_in(old_policy, va_block, addr, end, node, region) {
if (uvm_id_equal(old_policy->preferred_location, preferred_location))
continue;
// If the old preferred location is a valid processor ID, remote
// mappings should be established to the new preferred location if
// accessed-by is set.
uvm_processor_mask_zero(&set_accessed_by_processors);
if (UVM_ID_IS_VALID(old_policy->preferred_location) &&
uvm_processor_mask_test(&old_policy->accessed_by, old_policy->preferred_location))
uvm_processor_mask_set(&set_accessed_by_processors, old_policy->preferred_location);
if (!uvm_va_policy_set_preferred_location(va_block, region, preferred_location, old_policy))
return NV_ERR_NO_MEMORY;
// Establish new remote mappings if the old preferred location had
// accessed-by set.
for_each_id_in_mask(id, &set_accessed_by_processors) {
status = uvm_va_block_set_accessed_by_locked(va_block, va_block_context, id, region, out_tracker);
if (status != NV_OK)
return status;
}
// Even though the UVM_VA_BLOCK_RETRY_LOCKED() may unlock and relock
// the va_block lock, the policy remains valid because we hold the mmap
// lock so munmap can't remove the policy, and the va_space lock so the
// policy APIs can't change the policy.
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
NULL,
uvm_va_block_set_preferred_location_locked(va_block,
va_block_context,
region));
tracker_status = uvm_tracker_add_tracker_safe(out_tracker, &va_block->tracker);
if (status == NV_OK)
status = tracker_status;
if (status != NV_OK)
return status;
}
return NV_OK;
}
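// Set the preferred location policy over [base, last_address], creating HMM
// va_blocks as needed and updating policy and mappings one va_block at a time.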
NV_STATUS uvm_hmm_set_preferred_location(uvm_va_space_t *va_space,
uvm_processor_id_t preferred_location,
NvU64 base,
NvU64 last_address,
uvm_tracker_t *out_tracker)
3 years ago
{
2 years ago
uvm_va_block_context_t *va_block_context;
uvm_va_block_t *va_block;
NvU64 addr;
NV_STATUS status = NV_OK;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
UVM_ASSERT(PAGE_ALIGNED(base));
UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
UVM_ASSERT(base < last_address);
// Update HMM preferred location policy.
va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
for (addr = base; addr < last_address; addr = va_block->end + 1) {
NvU64 end;
status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block);
if (status != NV_OK)
break;
end = min(last_address, va_block->end);
uvm_mutex_lock(&va_block->lock);
status = hmm_set_preferred_location_locked(va_block,
va_block_context,
preferred_location,
addr,
end,
out_tracker);
uvm_mutex_unlock(&va_block->lock);
if (status != NV_OK)
break;
}
return status;
}
static NV_STATUS hmm_set_accessed_by_start_end_locked(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t processor_id,
NvU64 start,
NvU64 end,
uvm_tracker_t *out_tracker)
{
uvm_va_space_t *va_space = va_block->hmm.va_space;
uvm_va_policy_node_t *node;
uvm_va_block_region_t region;
NV_STATUS status = NV_OK;
uvm_for_each_va_policy_node_in(node, va_block, start, end) {
// Read duplication takes precedence over SetAccessedBy.
// Do not add mappings if read duplication is enabled.
if (uvm_va_policy_is_read_duplicate(&node->policy, va_space))
continue;
region = uvm_va_block_region_from_start_end(va_block,
max(start, node->node.start),
min(end, node->node.end));
status = uvm_va_block_set_accessed_by_locked(va_block,
va_block_context,
processor_id,
region,
out_tracker);
if (status != NV_OK)
break;
}
return status;
}
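// Set or clear the accessed-by policy over [base, last_address]. When setting
// the bit, mappings for the processor are also established (see
// hmm_set_accessed_by_start_end_locked()).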
NV_STATUS uvm_hmm_set_accessed_by(uvm_va_space_t *va_space,
uvm_processor_id_t processor_id,
bool set_bit,
NvU64 base,
NvU64 last_address,
uvm_tracker_t *out_tracker)
{
uvm_va_block_context_t *va_block_context;
uvm_va_block_t *va_block;
NvU64 addr;
NV_STATUS status = NV_OK;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
uvm_assert_rwsem_locked_write(&va_space->lock);
UVM_ASSERT(PAGE_ALIGNED(base));
UVM_ASSERT(PAGE_ALIGNED(last_address + 1));
UVM_ASSERT(base < last_address);
// Update HMM accessed by policy.
va_block_context = uvm_va_space_block_context(va_space, va_space->va_space_mm.mm);
for (addr = base; addr < last_address; addr = va_block->end + 1) {
NvU64 end;
status = hmm_va_block_find_create(va_space, addr, true, &va_block_context->hmm.vma, &va_block);
if (status != NV_OK)
break;
end = min(last_address, va_block->end);
uvm_mutex_lock(&va_block->lock);
status = uvm_va_policy_set_range(va_block,
addr,
end,
UVM_VA_POLICY_ACCESSED_BY,
!set_bit,
processor_id,
UVM_READ_DUPLICATION_MAX);
if (status == NV_OK && set_bit) {
status = hmm_set_accessed_by_start_end_locked(va_block,
va_block_context,
processor_id,
addr,
end,
out_tracker);
}
uvm_mutex_unlock(&va_block->lock);
if (status != NV_OK)
break;
}
return status;
}
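// After pages are evicted from a GPU, re-establish accessed-by mappings and,
// if map_remote_on_eviction is enabled, remote mappings for other GPUs that
// support access counters.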
void uvm_hmm_block_add_eviction_mappings(uvm_va_space_t *va_space,
uvm_va_block_t *va_block,
uvm_va_block_context_t *block_context)
{
uvm_tracker_t local_tracker = UVM_TRACKER_INIT();
uvm_va_policy_node_t *node;
uvm_va_block_region_t region;
uvm_processor_mask_t map_processors;
uvm_processor_id_t id;
NV_STATUS tracker_status;
NV_STATUS status = NV_OK;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
uvm_assert_mmap_lock_locked(va_space->va_space_mm.mm);
uvm_assert_rwsem_locked(&va_space->lock);
uvm_mutex_lock(&va_block->lock);
uvm_for_each_va_policy_node_in(node, va_block, va_block->start, va_block->end) {
for_each_id_in_mask(id, &node->policy.accessed_by) {
status = hmm_set_accessed_by_start_end_locked(va_block,
block_context,
id,
node->node.start,
node->node.end,
&local_tracker);
if (status != NV_OK)
break;
if (!uvm_va_space_map_remote_on_eviction(va_space))
continue;
// Exclude the processors that have been already mapped due to
// AccessedBy.
uvm_processor_mask_andnot(&map_processors, &va_block->evicted_gpus, &node->policy.accessed_by);
for_each_gpu_id_in_mask(id, &map_processors) {
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_space, id);
uvm_va_block_gpu_state_t *gpu_state;
if (!gpu->parent->access_counters_supported)
continue;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
UVM_ASSERT(gpu_state);
// TODO: Bug 2096389: uvm_va_block_add_mappings does not add
// remote mappings to read-duplicated pages. Add support for it
// or create a new function.
status = uvm_va_block_add_mappings(va_block,
block_context,
id,
region,
&gpu_state->evicted,
UvmEventMapRemoteCauseEviction);
tracker_status = uvm_tracker_add_tracker_safe(&local_tracker, &va_block->tracker);
status = (status == NV_OK) ? tracker_status : status;
if (status != NV_OK) {
UVM_ASSERT(status != NV_ERR_MORE_PROCESSING_REQUIRED);
break;
}
}
}
}
uvm_mutex_unlock(&va_block->lock);
tracker_status = uvm_tracker_wait_deinit(&local_tracker);
status = (status == NV_OK) ? tracker_status : status;
if (status != NV_OK) {
UVM_ERR_PRINT("Deferred mappings to evicted memory for block [0x%llx, 0x%llx] failed %s\n",
va_block->start,
va_block->end,
nvstatusToString(status));
}
}
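// Return the policy covering addr and clamp *endp to the intersection of the
// va_block, the VMA, and the policy node containing addr.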
const uvm_va_policy_t *uvm_hmm_find_policy_end(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
unsigned long addr,
NvU64 *endp)
{
const uvm_va_policy_node_t *node;
const uvm_va_policy_t *policy;
NvU64 end = va_block->end;
uvm_assert_mmap_lock_locked(vma->vm_mm);
uvm_assert_mutex_locked(&va_block->lock);
if (end > vma->vm_end - 1)
end = vma->vm_end - 1;
node = uvm_va_policy_node_find(va_block, addr);
if (node) {
policy = &node->policy;
if (end > node->node.end)
end = node->node.end;
}
else {
policy = &uvm_va_policy_default;
}
*endp = end;
return policy;
}
NV_STATUS uvm_hmm_find_policy_vma_and_outer(uvm_va_block_t *va_block,
struct vm_area_struct **vma_out,
uvm_page_index_t page_index,
const uvm_va_policy_t **policy,
uvm_page_index_t *outerp)
{
unsigned long addr;
NvU64 end;
uvm_page_index_t outer;
uvm_va_space_t *va_space = uvm_va_block_get_va_space(va_block);
struct mm_struct *mm = va_space->va_space_mm.mm;
if (!mm)
return NV_ERR_INVALID_ADDRESS;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
uvm_assert_mmap_lock_locked(mm);
uvm_assert_mutex_locked(&va_block->lock);
addr = uvm_va_block_cpu_page_address(va_block, page_index);
*vma_out = vma_lookup(mm, addr);
if (!*vma_out || !((*vma_out)->vm_flags & VM_READ))
return NV_ERR_INVALID_ADDRESS;
*policy = uvm_hmm_find_policy_end(va_block, *vma_out, addr, &end);
outer = uvm_va_block_cpu_page_index(va_block, end) + 1;
if (*outerp > outer)
*outerp = outer;
return NV_OK;
}
static NV_STATUS hmm_clear_thrashing_policy(uvm_va_block_t *va_block,
uvm_va_block_context_t *block_context)
{
const uvm_va_policy_t *policy;
uvm_va_policy_node_t *node;
uvm_va_block_region_t region;
NV_STATUS status = NV_OK;
uvm_mutex_lock(&va_block->lock);
uvm_for_each_va_policy_in(policy, va_block, va_block->start, va_block->end, node, region) {
// Unmap may split PTEs and require a retry. Needs to be called
// before the pinned pages information is destroyed.
status = UVM_VA_BLOCK_RETRY_LOCKED(va_block,
NULL,
uvm_perf_thrashing_unmap_remote_pinned_pages_all(va_block,
block_context,
region));
uvm_perf_thrashing_info_destroy(va_block);
if (status != NV_OK)
break;
}
uvm_mutex_unlock(&va_block->lock);
return status;
}
NV_STATUS uvm_hmm_clear_thrashing_policy(uvm_va_space_t *va_space)
{
uvm_va_block_context_t *block_context = uvm_va_space_block_context(va_space, NULL);
uvm_range_tree_node_t *node, *next;
uvm_va_block_t *va_block;
NV_STATUS status = NV_OK;
if (!uvm_hmm_is_enabled(va_space))
return NV_OK;
uvm_assert_rwsem_locked_write(&va_space->lock);
uvm_range_tree_for_each_safe(node, next, &va_space->hmm.blocks) {
va_block = hmm_va_block_from_node(node);
status = hmm_clear_thrashing_policy(va_block, block_context);
if (status != NV_OK)
break;
}
return status;
}
uvm_va_block_region_t uvm_hmm_get_prefetch_region(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
const uvm_va_policy_t *policy,
NvU64 address)
{
NvU64 start, end;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
// We need to limit the prefetch region to the VMA.
start = max(va_block->start, (NvU64)vma->vm_start);
end = min(va_block->end, (NvU64)vma->vm_end - 1);
// Also, we need to limit the prefetch region to the policy range.
if (uvm_va_policy_is_default(policy)) {
NV_STATUS status = uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree,
address,
&start,
&end);
// We already know the hole exists and covers the fault region.
UVM_ASSERT(status == NV_OK);
}
else {
const uvm_va_policy_node_t *node = uvm_va_policy_node_from_policy(policy);
start = max(start, node->node.start);
end = min(end, node->node.end);
}
return uvm_va_block_region_from_start_end(va_block, start, end);
}
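// Compute the maximum protection the VMA flags allow for addr: none without
// VM_READ, read-only without VM_WRITE, otherwise read-write-atomic.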
uvm_prot_t uvm_hmm_compute_logical_prot(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
NvU64 addr)
{
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
uvm_assert_mmap_lock_locked(va_block->hmm.va_space->va_space_mm.mm);
UVM_ASSERT(vma && addr >= vma->vm_start && addr < vma->vm_end);
if (!(vma->vm_flags & VM_READ))
return UVM_PROT_NONE;
else if (!(vma->vm_flags & VM_WRITE))
return UVM_PROT_READ_ONLY;
else
return UVM_PROT_READ_WRITE_ATOMIC;
}
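// Record a system memory page in the va_block: wrap it in an HMM CPU chunk,
// insert the chunk at page_index, and map it on the registered GPUs.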
static NV_STATUS hmm_va_block_cpu_page_populate(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
uvm_cpu_chunk_t *chunk;
NV_STATUS status;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
UVM_ASSERT(!uvm_page_mask_test(&va_block->cpu.allocated, page_index));
if (page == ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)))
return NV_ERR_INVALID_ADDRESS;
status = uvm_cpu_chunk_alloc_hmm(page, &chunk);
if (status != NV_OK)
return status;
status = uvm_cpu_chunk_insert_in_block(va_block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_free(chunk);
return status;
}
status = uvm_va_block_map_cpu_chunk_on_gpus(va_block, chunk, page_index);
if (status != NV_OK) {
uvm_cpu_chunk_remove_from_block(va_block, page_to_nid(page), page_index);
uvm_cpu_chunk_free(chunk);
}
return status;
}
static void hmm_va_block_cpu_unpopulate_chunk(uvm_va_block_t *va_block,
uvm_cpu_chunk_t *chunk,
int chunk_nid,
uvm_page_index_t page_index)
{
if (!chunk)
return;
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
!uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(uvm_cpu_chunk_get_size(chunk) == PAGE_SIZE);
uvm_cpu_chunk_remove_from_block(va_block, chunk_nid, page_index);
uvm_va_block_unmap_cpu_chunk_on_gpus(va_block, chunk, page_index);
uvm_cpu_chunk_free(chunk);
}
static void hmm_va_block_cpu_page_unpopulate(uvm_va_block_t *va_block, uvm_page_index_t page_index, struct page *page)
{
uvm_cpu_chunk_t *chunk;
UVM_ASSERT(uvm_va_block_is_hmm(va_block));
if (page) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, page_to_nid(page), page_index);
}
else {
int nid;
for_each_possible_uvm_node(nid) {
chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, nid, page_index);
hmm_va_block_cpu_unpopulate_chunk(va_block, chunk, nid, page_index);
}
}
}
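// Return true if the given struct page is the page already recorded for
// page_index in the va_block.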
static bool hmm_va_block_cpu_page_is_same(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
struct page *old_page = uvm_va_block_get_cpu_page(va_block, page_index);
UVM_ASSERT(uvm_cpu_chunk_is_hmm(uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(page), page_index)));
return old_page == page;
}
// uvm_va_block_service_copy() and uvm_va_block_service_finish() expect the
// service_context masks to match what is being processed. Since a page
// that was expected to be processed isn't migrating, we have to clear the
// masks to make service_context consistent with what is actually being
// handled.
static void clear_service_context_masks(uvm_service_block_context_t *service_context,
uvm_processor_id_t new_residency,
uvm_page_index_t page_index)
{
uvm_page_mask_clear(&service_context->block_context->caller_page_mask, page_index);
uvm_page_mask_clear(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency,
page_index);
if (uvm_page_mask_empty(&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency))
uvm_processor_mask_clear(&service_context->resident_processors, new_residency);
if (UVM_ID_IS_VALID(service_context->prefetch_hint.residency))
uvm_page_mask_clear(&service_context->prefetch_hint.prefetch_pages_mask, page_index);
if (service_context->thrashing_pin_count > 0 &&
uvm_page_mask_test_and_clear(&service_context->thrashing_pin_mask, page_index)) {
service_context->thrashing_pin_count--;
}
if (service_context->read_duplicate_count > 0 &&
uvm_page_mask_test_and_clear(&service_context->read_duplicate_mask, page_index)) {
service_context->read_duplicate_count--;
}
}
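// Helpers that mirror the CPU mapping state for a page in the va_block's
// pte_bits masks; Linux manages the actual CPU page tables.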
static void cpu_mapping_set(uvm_va_block_t *va_block,
bool is_write,
uvm_page_index_t page_index)
{
uvm_processor_mask_set(&va_block->mapped, UVM_ID_CPU);
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
if (is_write)
uvm_page_mask_set(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
else
uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
}
static void cpu_mapping_clear(uvm_va_block_t *va_block, uvm_page_index_t page_index)
{
uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_WRITE], page_index);
uvm_page_mask_clear(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ], page_index);
if (uvm_page_mask_empty(&va_block->cpu.pte_bits[UVM_PTE_BITS_CPU_READ]))
uvm_processor_mask_clear(&va_block->mapped, UVM_ID_CPU);
}
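// Forget the device-private GPU chunk backing page_index: unmap its internal
// mapping and clear the gpu_state chunk pointer.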
static void gpu_chunk_remove(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
uvm_va_block_gpu_state_t *gpu_state;
uvm_gpu_chunk_t *gpu_chunk;
uvm_gpu_id_t id;
id = uvm_pmm_devmem_page_to_gpu_id(page);
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
UVM_ASSERT(gpu_state);
gpu_chunk = gpu_state->chunks[page_index];
if (!gpu_chunk) {
// If we didn't find a chunk it's because the page was unmapped for
// mremap and no fault has established a new mapping.
UVM_ASSERT(!uvm_page_mask_test(&gpu_state->resident, page_index));
return;
}
// TODO: Bug 3898467: unmap indirect peers when freeing GPU chunks
uvm_mmu_chunk_unmap(gpu_chunk, &va_block->tracker);
gpu_state->chunks[page_index] = NULL;
}
static NV_STATUS gpu_chunk_add(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *page)
{
uvm_va_block_gpu_state_t *gpu_state;
uvm_gpu_chunk_t *gpu_chunk;
uvm_gpu_id_t id;
NV_STATUS status;
id = uvm_pmm_devmem_page_to_gpu_id(page);
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
// It's possible that this is a fresh va_block we're trying to add an
// existing gpu_chunk to. This occurs for example when a GPU faults on a
// virtual address that has been remapped with mremap().
if (!gpu_state) {
status = uvm_va_block_gpu_state_alloc(va_block);
if (status != NV_OK)
return status;
gpu_state = uvm_va_block_gpu_state_get(va_block, id);
}
UVM_ASSERT(gpu_state);
// Note that a mremap() might be to a CPU virtual address that is no longer
// aligned with a larger GPU chunk size. We would need to allocate a new
// aligned GPU chunk and copy from old to new.
// TODO: Bug 3368756: add support for large GPU pages.
gpu_chunk = uvm_pmm_devmem_page_to_chunk(page);
UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_ALLOCATED);
UVM_ASSERT(gpu_chunk->is_referenced);
UVM_ASSERT(page->zone_device_data == va_block->hmm.va_space);
if (gpu_state->chunks[page_index] == gpu_chunk)
return NV_OK;
UVM_ASSERT(!gpu_state->chunks[page_index]);
// In some configurations such as SR-IOV heavy, the chunk cannot be
// referenced using its physical address. Create a virtual mapping.
status = uvm_mmu_chunk_map(gpu_chunk);
if (status != NV_OK)
return status;
// TODO: Bug 3898467: map indirect peers.
uvm_processor_mask_set(&va_block->resident, id);
uvm_page_mask_set(&gpu_state->resident, page_index);
// It is safe to modify the page index field without holding any PMM locks
// because the chunk is allocated, which means that none of the other
// fields in the bitmap can change.
gpu_chunk->va_block = va_block;
gpu_chunk->va_block_page_index = page_index;
gpu_state->chunks[page_index] = gpu_chunk;
return NV_OK;
}
// This is called just before calling migrate_vma_finalize() in order to wait
// for GPU operations to complete and update the va_block state to match which
// pages migrated (or not) and therefore which pages will be released by
// migrate_vma_finalize().
// 'migrated_pages' is the mask of pages that migrated,
// 'same_devmem_page_mask' is the mask of pages that are the same in src_pfns
// and dst_pfns and therefore appear to migrate_vma_*() to be not migrating.
// 'region' is the page index region of all migrated, non-migrated, and
// same_devmem_page_mask pages.
static NV_STATUS sync_page_and_chunk_state(uvm_va_block_t *va_block,
const unsigned long *src_pfns,
const unsigned long *dst_pfns,
uvm_va_block_region_t region,
const uvm_page_mask_t *migrated_pages,
const uvm_page_mask_t *same_devmem_page_mask)
{
uvm_page_index_t page_index;
NV_STATUS status;
// Wait for the GPU to finish. migrate_vma_finalize() will release the
// migrated source pages (or non-migrating destination pages), so GPU
// operations must be finished by then.
status = uvm_tracker_wait(&va_block->tracker);
for_each_va_block_page_in_region(page_index, region) {
struct page *page;
if (uvm_page_mask_test(same_devmem_page_mask, page_index))
continue;
// If a page migrated, clean up the source page.
// Otherwise, clean up the destination page.
if (uvm_page_mask_test(migrated_pages, page_index))
page = migrate_pfn_to_page(src_pfns[page_index]);
else
page = migrate_pfn_to_page(dst_pfns[page_index]);
if (!page)
continue;
if (is_device_private_page(page)) {
gpu_chunk_remove(va_block, page_index, page);
}
else {
// If the source page is a system memory page,
// migrate_vma_finalize() will release the reference so we should
// clear our pointer to it.
// TODO: Bug 3660922: Need to handle read duplication at some point.
hmm_va_block_cpu_page_unpopulate(va_block, page_index, page);
}
}
return status;
}
// Update va_block state to reflect that the page isn't migrating.
static void clean_up_non_migrating_page(uvm_va_block_t *va_block,
const unsigned long *src_pfns,
unsigned long *dst_pfns,
uvm_page_index_t page_index)
{
struct page *dst_page = migrate_pfn_to_page(dst_pfns[page_index]);
if (!dst_page)
return;
// migrate_vma_finalize() will release the dst_page reference so don't keep
// a pointer to it.
if (is_device_private_page(dst_page)) {
gpu_chunk_remove(va_block, page_index, dst_page);
}
else {
UVM_ASSERT(page_ref_count(dst_page) == 1);
hmm_va_block_cpu_page_unpopulate(va_block, page_index, dst_page);
}
unlock_page(dst_page);
put_page(dst_page);
dst_pfns[page_index] = 0;
}
static void clean_up_non_migrating_pages(uvm_va_block_t *va_block,
const unsigned long *src_pfns,
unsigned long *dst_pfns,
uvm_va_block_region_t region,
uvm_page_mask_t *page_mask)
{
uvm_page_index_t page_index;
    NV_STATUS status;

    status = uvm_tracker_wait(&va_block->tracker);
    UVM_ASSERT(status == NV_OK);

for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
clean_up_non_migrating_page(va_block, src_pfns, dst_pfns, page_index);
}
}
// CPU page fault handling.
// Fill in the dst_pfns[page_index] entry given that there is an allocated
// CPU page.
static void lock_block_cpu_page(uvm_va_block_t *va_block,
uvm_page_index_t page_index,
struct page *src_page,
unsigned long *dst_pfns,
uvm_page_mask_t *same_devmem_page_mask)
{
    uvm_cpu_chunk_t *chunk = uvm_cpu_chunk_get_chunk_for_page(va_block, page_to_nid(src_page), page_index);
uvm_va_block_region_t chunk_region;
struct page *dst_page;
UVM_ASSERT(chunk);
UVM_ASSERT(chunk->page);
chunk_region = uvm_va_block_chunk_region(va_block, uvm_cpu_chunk_get_size(chunk), page_index);
dst_page = chunk->page + (page_index - chunk_region.first);
UVM_ASSERT(dst_page != ZERO_PAGE(uvm_va_block_cpu_page_address(va_block, page_index)));
UVM_ASSERT(!is_device_private_page(dst_page));
// The source page is usually a device private page but it could be a GPU
// remote mapped system memory page. It could also be a driver allocated
// page for GPU-to-GPU staged copies (i.e., not a resident copy and owned
// by the driver).
if (is_device_private_page(src_page)) {
// Since the page isn't mirrored, it was allocated by alloc_pages()
// and UVM owns the reference. We leave the reference count unchanged
// and mark the page pointer as mirrored since UVM is transferring
// ownership to Linux and we don't want UVM to double free the page in
// hmm_va_block_cpu_page_unpopulate() or block_kill(). If the page
// does not migrate, it will be freed though.
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
                   !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
UVM_ASSERT(chunk->type == UVM_CPU_CHUNK_TYPE_PHYSICAL);
UVM_ASSERT(page_ref_count(dst_page) == 1);
uvm_cpu_chunk_make_hmm(chunk);
}
else {
UVM_ASSERT(same_devmem_page_mask);
UVM_ASSERT(src_page == dst_page);
uvm_page_mask_set(same_devmem_page_mask, page_index);
        // The call to migrate_vma_setup() will have inserted a migration PTE
        // so the CPU has no access.
        cpu_mapping_clear(va_block, page_index);

return;
}
lock_page(dst_page);
dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
}
static void hmm_mark_gpu_chunk_referenced(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
uvm_gpu_chunk_t *gpu_chunk)
{
// Tell PMM to expect a callback from Linux to free the page since the
// device private struct page reference count will determine when the
// GPU chunk is free.
UVM_ASSERT(gpu_chunk->state == UVM_PMM_GPU_CHUNK_STATE_TEMP_PINNED);
list_del_init(&gpu_chunk->list);
uvm_pmm_gpu_unpin_referenced(&gpu->pmm, gpu_chunk, va_block);
}
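
// Fill in dst_pfns[page_index] with the device private PFN of the GPU chunk
// backing this page on the destination GPU. If the source is already that
// same GPU page, record it in same_devmem_page_mask instead since
// migrate_vma_pages() would not migrate it.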
static void fill_dst_pfn(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
const unsigned long *src_pfns,
unsigned long *dst_pfns,
uvm_page_index_t page_index,
uvm_page_mask_t *same_devmem_page_mask)
{
unsigned long src_pfn = src_pfns[page_index];
uvm_gpu_chunk_t *gpu_chunk;
unsigned long pfn;
struct page *dpage;
gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block, gpu, uvm_va_block_cpu_page_address(va_block, page_index));
UVM_ASSERT(gpu_chunk);
UVM_ASSERT(gpu_chunk->log2_size == PAGE_SHIFT);
pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
// If the same GPU page is both source and destination, migrate_vma_pages()
// will see the wrong "expected" reference count and not migrate it, so we
// mark it as not migrating but we keep track of this so we don't confuse
// it with a page that migrate_vma_pages() actually does not migrate.
if ((src_pfn & MIGRATE_PFN_VALID) && (src_pfn >> MIGRATE_PFN_SHIFT) == pfn) {
uvm_page_mask_set(same_devmem_page_mask, page_index);
return;
}
dpage = pfn_to_page(pfn);
UVM_ASSERT(is_device_private_page(dpage));
UVM_ASSERT(dpage->pgmap->owner == &g_uvm_global);
hmm_mark_gpu_chunk_referenced(va_block, gpu, gpu_chunk);
UVM_ASSERT(!page_count(dpage));
zone_device_page_init(dpage);
dpage->zone_device_data = va_block->hmm.va_space;
dst_pfns[page_index] = migrate_pfn(pfn);
}
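
// Fill in the destination PFNs for every page in the region and mask that
// migrate_vma_setup() selected for migration (MIGRATE_PFN_MIGRATE set).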
static void fill_dst_pfns(uvm_va_block_t *va_block,
const unsigned long *src_pfns,
unsigned long *dst_pfns,
uvm_va_block_region_t region,
uvm_page_mask_t *page_mask,
uvm_page_mask_t *same_devmem_page_mask,
uvm_processor_id_t dest_id)
{
uvm_gpu_t *gpu = uvm_va_space_get_gpu(va_block->hmm.va_space, dest_id);
uvm_page_index_t page_index;
uvm_page_mask_zero(same_devmem_page_mask);
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE))
continue;
fill_dst_pfn(va_block,
gpu,
src_pfns,
dst_pfns,
page_index,
same_devmem_page_mask);
}
}
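
// Allocate or lock the destination system memory pages for the pages in
// page_mask that are migrating to the CPU and fill in dst_pfns[]. Pages that
// can't be serviced are cleared from page_mask (and from the service context
// masks when one is provided).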
static NV_STATUS alloc_and_copy_to_cpu(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
const unsigned long *src_pfns,
unsigned long *dst_pfns,
uvm_va_block_region_t region,
uvm_page_mask_t *page_mask,
uvm_page_mask_t *same_devmem_page_mask,
uvm_processor_id_t processor_id,
uvm_service_block_context_t *service_context)
{
uvm_page_index_t page_index;
NV_STATUS status = NV_OK;
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
struct page *src_page;
struct page *dst_page;
gfp_t gfp;
if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
// Device exclusive PTEs are not selected but we still want to
// process the page so record it as such.
if (service_context && !UVM_ID_IS_CPU(processor_id) &&
service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG) {
uvm_page_mask_set(same_devmem_page_mask, page_index);
continue;
}
            // We have previously found a page that is CPU resident which can't
            // be migrated (probably a shared mapping) so make sure we establish
            // a remote mapping for it.
            if (uvm_page_mask_test(same_devmem_page_mask, page_index))
                continue;

goto clr_mask;
}
// This is the page that will be copied to system memory.
src_page = migrate_pfn_to_page(src_pfns[page_index]);
if (src_page) {
            // mremap may have caused us to lose the gpu_chunk associated with
            // this va_block/page_index so make sure we have the correct chunk.
            if (is_device_private_page(src_page))
                gpu_chunk_add(va_block, page_index, src_page);

if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
lock_block_cpu_page(va_block, page_index, src_page, dst_pfns, same_devmem_page_mask);
continue;
}
}
UVM_ASSERT(!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
                   !uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));

// Allocate a user system memory page for the destination.
// This is the typical case since Linux will free the source page when
// migrating to device private memory.
// If there is no source page, it means the page is pte_none() or the
// zero page. This case "shouldn't happen" because we asked
// migrate_vma_setup() only for device private pages but
// migrate_vma_collect_hole() doesn't check the
// MIGRATE_VMA_SELECT_SYSTEM flag.
gfp = GFP_HIGHUSER_MOVABLE;
if (!src_page)
gfp |= __GFP_ZERO;
dst_page = alloc_page_vma(gfp,
vma,
va_block->start + (page_index << PAGE_SHIFT));
if (!dst_page) {
// Ignore errors if the page is only for prefetching.
if (service_context &&
service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
goto clr_mask;
UVM_ERR_PRINT("cannot allocate page %u (addr 0x%llx)\n",
page_index, va_block->start + (page_index << PAGE_SHIFT));
status = NV_ERR_NO_MEMORY;
break;
}
status = hmm_va_block_cpu_page_populate(va_block, page_index, dst_page);
if (status != NV_OK) {
__free_page(dst_page);
// Ignore errors if the page is only for prefetching.
if (service_context &&
service_context->access_type[page_index] == UVM_FAULT_ACCESS_TYPE_PREFETCH)
goto clr_mask;
break;
}
// Note that we don't call get_page(dst_page) since alloc_page_vma()
// returns with a page reference count of one and we are passing
// ownership to Linux. Also, uvm_va_block_cpu_page_populate() recorded
// the page as "mirrored" so that migrate_vma_finalize() and
// hmm_va_block_cpu_page_unpopulate() don't double free the page.
lock_page(dst_page);
dst_pfns[page_index] = migrate_pfn(page_to_pfn(dst_page));
continue;
clr_mask:
// TODO: Bug 3900774: clean up murky mess of mask clearing.
uvm_page_mask_clear(page_mask, page_index);
        if (service_context)
            clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
}
if (status != NV_OK)
clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
    else if (uvm_page_mask_empty(page_mask))
        return NV_WARN_MORE_PROCESSING_REQUIRED;

return status;
}
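
// First half of servicing a fault whose destination is the CPU: allocate the
// destination pages and start the copy from device memory without updating
// residency or mappings yet.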
static NV_STATUS uvm_hmm_devmem_fault_alloc_and_copy(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
{
uvm_processor_id_t processor_id;
uvm_service_block_context_t *service_context;
uvm_va_block_retry_t *va_block_retry;
const unsigned long *src_pfns;
unsigned long *dst_pfns;
uvm_page_mask_t *page_mask;
uvm_page_mask_t *same_devmem_page_mask = &devmem_fault_context->same_devmem_page_mask;
uvm_va_block_t *va_block;
NV_STATUS status = NV_OK;
processor_id = devmem_fault_context->processor_id;
service_context = devmem_fault_context->service_context;
va_block_retry = devmem_fault_context->va_block_retry;
va_block = devmem_fault_context->va_block;
    src_pfns = service_context->block_context->hmm.src_pfns;
    dst_pfns = service_context->block_context->hmm.dst_pfns;

// Build the migration page mask.
// Note that thrashing pinned pages and prefetch pages are already
// accounted for in service_context->per_processor_masks.
page_mask = &devmem_fault_context->page_mask;
uvm_page_mask_copy(page_mask, &service_context->per_processor_masks[UVM_ID_CPU_VALUE].new_residency);
status = alloc_and_copy_to_cpu(va_block,
                                   service_context->block_context->hmm.vma,
src_pfns,
dst_pfns,
service_context->region,
page_mask,
same_devmem_page_mask,
processor_id,
service_context);
if (status != NV_OK)
return status;
// Do the copy but don't update the residency or mapping for the new
// location yet.
return uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
}
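
// Second half: called after migrate_vma_pages() to finish servicing the pages
// that actually migrated and to sync the va_block page/chunk state before
// migrate_vma_finalize() releases them.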
static NV_STATUS uvm_hmm_devmem_fault_finalize_and_map(uvm_hmm_devmem_fault_context_t *devmem_fault_context)
{
uvm_processor_id_t processor_id;
uvm_service_block_context_t *service_context;
uvm_perf_prefetch_hint_t *prefetch_hint;
uvm_va_block_retry_t *va_block_retry;
const unsigned long *src_pfns;
unsigned long *dst_pfns;
uvm_page_mask_t *page_mask;
uvm_va_block_t *va_block;
uvm_va_block_region_t region;
uvm_page_index_t page_index;
NV_STATUS status, tracker_status;
processor_id = devmem_fault_context->processor_id;
service_context = devmem_fault_context->service_context;
prefetch_hint = &service_context->prefetch_hint;
va_block = devmem_fault_context->va_block;
va_block_retry = devmem_fault_context->va_block_retry;
    src_pfns = service_context->block_context->hmm.src_pfns;
    dst_pfns = service_context->block_context->hmm.dst_pfns;

region = service_context->region;
page_mask = &devmem_fault_context->page_mask;
// There are a number of reasons why HMM will mark a page as not migrating
// even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
if (src_pfns[page_index] & MIGRATE_PFN_MIGRATE)
continue;
// If a page isn't migrating and only the GPU page table is being
// updated, continue to process it normally.
if (uvm_page_mask_test(&devmem_fault_context->same_devmem_page_mask, page_index))
continue;
// TODO: Bug 3900774: clean up murky mess of mask clearing.
uvm_page_mask_clear(page_mask, page_index);
        clear_service_context_masks(service_context, UVM_ID_CPU, page_index);
    }

if (uvm_page_mask_empty(page_mask))
status = NV_WARN_MORE_PROCESSING_REQUIRED;
else
status = uvm_va_block_service_finish(processor_id, va_block, service_context);
tracker_status = sync_page_and_chunk_state(va_block,
src_pfns,
dst_pfns,
region,
page_mask,
&devmem_fault_context->same_devmem_page_mask);

    return status == NV_OK ? tracker_status : status;
}
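
// Update the va_block state (GPU chunk pointers, CPU residency and mapping
// protection) for the region from a snapshot of hmm_range_fault() PFNs.
// Newly populated CPU pages are optionally recorded in populated_page_mask.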
static NV_STATUS populate_region(uvm_va_block_t *va_block,
unsigned long *pfns,
                                 uvm_va_block_region_t region,
                                 uvm_page_mask_t *populated_page_mask)
{
uvm_page_index_t page_index;
NV_STATUS status;
// Make sure GPU state is allocated or else the GPU DMA mappings to
// system memory won't be saved.
status = uvm_va_block_gpu_state_alloc(va_block);
if (status != NV_OK)
return status;
for_each_va_block_page_in_region(page_index, region) {
struct page *page;
// This case should only happen when querying CPU residency and we ask
// for something not covered by a VMA. Otherwise, hmm_range_fault()
// returns -EFAULT instead of setting the HMM_PFN_ERROR bit.
if (pfns[page_index] & HMM_PFN_ERROR)
return NV_ERR_INVALID_ADDRESS;
if (pfns[page_index] & HMM_PFN_VALID) {
page = hmm_pfn_to_page(pfns[page_index]);
}
else {
// The page can't be evicted since it has to be migrated to the GPU
// first which would leave a device private page entry so this has
// to be a pte_none(), swapped out, or similar entry.
// The page would have been allocated if populate_region() is being
// called from uvm_hmm_va_block_service_locked() so this must be
// for uvm_hmm_va_block_update_residency_info(). Just leave the
// residency/populated information unchanged since
// uvm_hmm_invalidate() should handle that if the underlying page
// is invalidated.
// Also note there can be an allocated page due to GPU-to-GPU
// migration between non-peer or indirect peer GPUs.
continue;
}
if (is_device_private_page(page)) {
// Linux can call hmm_invalidate() and we have to clear the GPU
// chunk pointer in uvm_va_block_gpu_state_t::chunks[] but it might
// not release the device private struct page reference. Since
// hmm_range_fault() did find a device private PTE, we can
// re-establish the GPU chunk pointer.
status = gpu_chunk_add(va_block, page_index, page);
if (status != NV_OK)
return status;
continue;
}
        // If a CPU chunk is already allocated, check that it matches what
        // hmm_range_fault() found.
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
}
else {
status = hmm_va_block_cpu_page_populate(va_block, page_index, page);
if (status != NV_OK)
return status;
            // Record that we populated this page. hmm_block_cpu_fault_locked()
            // uses this to ensure pages that don't migrate get remote mapped.
            if (populated_page_mask)
                uvm_page_mask_set(populated_page_mask, page_index);
}
// Since we have a stable snapshot of the CPU pages, we can
// update the residency and protection information.
        uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
cpu_mapping_set(va_block, pfns[page_index] & HMM_PFN_WRITE, page_index);
}
return NV_OK;
}
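
// Record the va_block HMM invalidation sequence number in the current thread
// context so that hmm_range_fault_retry() can tell whether an invalidation
// callback ran while the va_block lock was dropped around hmm_range_fault().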
static void hmm_range_fault_begin(uvm_va_block_t *va_block)
{
uvm_thread_context_t *uvm_context = uvm_thread_context();
uvm_assert_mutex_locked(&va_block->lock);
uvm_context->hmm_invalidate_seqnum = va_block->hmm.changed;
}
static bool hmm_range_fault_retry(uvm_va_block_t *va_block)
{
    uvm_thread_context_t *uvm_context = uvm_thread_context();

uvm_assert_mutex_locked(&va_block->lock);
return uvm_context->hmm_invalidate_seqnum != va_block->hmm.changed;
}
// Make the region be resident on the CPU by calling hmm_range_fault() to fault
// in CPU pages.
static NV_STATUS hmm_make_resident_cpu(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
unsigned long *hmm_pfns,
uvm_va_block_region_t region,
NvU8 *access_type,
uvm_page_mask_t *populated_page_mask)
{
uvm_page_index_t page_index;
int ret;
struct hmm_range range = {
.notifier = &va_block->hmm.notifier,
.start = uvm_va_block_region_start(va_block, region),
.end = uvm_va_block_region_end(va_block, region) + 1,
.hmm_pfns = hmm_pfns + region.first,
.pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
.dev_private_owner = &g_uvm_global,
};
for_each_va_block_page_in_region(page_index, region) {
        if ((access_type && access_type[page_index] >= UVM_FAULT_ACCESS_TYPE_WRITE) ||
            (vma->vm_flags & VM_WRITE))
hmm_pfns[page_index] = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
else
hmm_pfns[page_index] = HMM_PFN_REQ_FAULT;
}

    hmm_range_fault_begin(va_block);

// Mirror the VA block to the HMM address range.
// Note that we request HMM to handle page faults, which means that it will
// populate and map potentially not-yet-existing pages to the VMA.
// Also note that mmu_interval_read_begin() calls wait_event() for any
// parallel invalidation callbacks to finish so we can't hold locks that
// the invalidation callback acquires.
uvm_mutex_unlock(&va_block->lock);
range.notifier_seq = mmu_interval_read_begin(range.notifier);
ret = hmm_range_fault(&range);
uvm_mutex_lock(&va_block->lock);
if (ret)
return (ret == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(ret);
    if (hmm_range_fault_retry(va_block))
return NV_WARN_MORE_PROCESSING_REQUIRED;
return populate_region(va_block,
                           hmm_pfns,
                           region,
                           populated_page_mask);
}
// Release the reference count on any pages that were made device exclusive.
static void hmm_release_atomic_pages(uvm_va_block_t *va_block,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
uvm_page_index_t page_index;
for_each_va_block_page_in_region(page_index, region) {
        struct page *page = service_context->block_context->hmm.pages[page_index];

if (!page)
continue;
unlock_page(page);
put_page(page);
}
}
static NV_STATUS hmm_block_atomic_fault_locked(uvm_processor_id_t processor_id,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
    struct page **pages = service_context->block_context->hmm.pages;
int npages;
uvm_page_index_t page_index;
uvm_make_resident_cause_t cause;
NV_STATUS status;
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
        !uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
// There is an atomic GPU fault. We need to make sure no pages are
// GPU resident so that make_device_exclusive_range() doesn't call
// migrate_to_ram() and cause a va_space lock recursion problem.
if (service_context->operation == UVM_SERVICE_OPERATION_REPLAYABLE_FAULTS)
cause = UVM_MAKE_RESIDENT_CAUSE_REPLAYABLE_FAULT;
else if (service_context->operation == UVM_SERVICE_OPERATION_NON_REPLAYABLE_FAULTS)
cause = UVM_MAKE_RESIDENT_CAUSE_NON_REPLAYABLE_FAULT;
else
cause = UVM_MAKE_RESIDENT_CAUSE_ACCESS_COUNTER;
        status = uvm_hmm_va_block_migrate_locked(va_block,
                                                 va_block_retry,
                                                 service_context->block_context,
                                                 UVM_ID_CPU,
                                                 region,
                                                 cause);
        if (status != NV_OK)
            goto done;
// make_device_exclusive_range() will try to call migrate_to_ram()
// and deadlock with ourself if the data isn't CPU resident.
if (!uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU) ||
            !uvm_va_block_cpu_is_region_resident_on(va_block, NUMA_NO_NODE, region)) {
status = NV_WARN_MORE_PROCESSING_REQUIRED;
            goto done;
}
}

    // TODO: Bug 4014681: atomic GPU operations are not supported on MAP_SHARED
    // mmap() files so we check for that here and report a fatal fault.
    // Otherwise, with the current Linux 6.1 make_device_exclusive_range(),
    // it doesn't make the page exclusive and we end up in an endless loop.
    if (service_context->block_context->hmm.vma->vm_flags & (VM_SHARED | VM_HUGETLB)) {
        status = NV_ERR_NOT_SUPPORTED;
        goto done;
    }

    hmm_range_fault_begin(va_block);

    uvm_mutex_unlock(&va_block->lock);

    npages = make_device_exclusive_range(service_context->block_context->mm,
uvm_va_block_cpu_page_address(va_block, region.first),
uvm_va_block_cpu_page_address(va_block, region.outer - 1) + PAGE_SIZE,
pages + region.first,
&g_uvm_global);
uvm_mutex_lock(&va_block->lock);
if (npages < 0) {
status = (npages == -EBUSY) ? NV_WARN_MORE_PROCESSING_REQUIRED : errno_to_nv_status(npages);
        goto done;
}
while ((size_t)npages < uvm_va_block_region_num_pages(region))
pages[region.first + npages++] = NULL;
    if (hmm_range_fault_retry(va_block)) {
status = NV_WARN_MORE_PROCESSING_REQUIRED;
goto release;
}
status = NV_OK;
for_each_va_block_page_in_region(page_index, region) {
struct page *page = pages[page_index];
if (!page) {
// Record that one of the pages isn't exclusive but keep converting
// the others.
status = NV_WARN_MORE_PROCESSING_REQUIRED;
continue;
}
        // If a CPU chunk is already allocated, check that it matches what
        // make_device_exclusive_range() found.
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
            UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
NV_STATUS s = hmm_va_block_cpu_page_populate(va_block, page_index, page);
            if (s == NV_OK)
                uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(page), page_index);
}
cpu_mapping_clear(va_block, page_index);
}
if (status != NV_OK)
goto release;
status = uvm_va_block_service_copy(processor_id, UVM_ID_CPU, va_block, va_block_retry, service_context);
if (status != NV_OK)
goto release;
status = uvm_va_block_service_finish(processor_id, va_block, service_context);
release:
hmm_release_atomic_pages(va_block, service_context);

done:
return status;
}
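
// Return true if any page in the region is being accessed with a strong
// atomic access type.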
static bool is_atomic_fault(NvU8 *access_type, uvm_va_block_region_t region)
{
uvm_page_index_t page_index;
for_each_va_block_page_in_region(page_index, region) {
if (access_type[page_index] == UVM_FAULT_ACCESS_TYPE_ATOMIC_STRONG)
return true;
}
return false;
}
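
// Return true if any GPU has resident pages in the given region.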
static bool is_gpu_resident(uvm_va_block_t *va_block, uvm_va_block_region_t region)
{
uvm_processor_id_t gpu_id;
for_each_gpu_id_in_mask(gpu_id, &va_block->resident) {
uvm_va_block_gpu_state_t *gpu_state;
gpu_state = uvm_va_block_gpu_state_get(va_block, gpu_id);
if (!uvm_page_mask_region_empty(&gpu_state->resident, region))
return true;
}
return false;
}
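
// Service a fault whose destination residency is the CPU: fault in the CPU
// pages and, if a GPU has a resident copy, migrate the device private pages
// back to system memory with migrate_vma_*().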
static NV_STATUS hmm_block_cpu_fault_locked(uvm_processor_id_t processor_id,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
uvm_va_block_region_t region = service_context->region;
    struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
NV_STATUS status;
int ret;
uvm_hmm_devmem_fault_context_t fault_context = {
.processor_id = processor_id,
.va_block = va_block,
.va_block_retry = va_block_retry,
.service_context = service_context,
};
    // Normally the source page will be a device private page that is being
    // migrated to system memory. However, when it is a GPU fault, the source
    // page can be a system memory page that the GPU needs to remote map
    // instead. Note that migrate_vma_setup() won't select these types of
    // mappings/pages:
    // - device exclusive PTEs
    // - shared mappings
    // - file backed mappings
    // Also, if the source and destination page are the same, the page reference
    // count won't be the "expected" count and migrate_vma_pages() won't migrate
    // it. This mask records that uvm_hmm_devmem_fault_alloc_and_copy() and
    // uvm_hmm_devmem_fault_finalize_and_map() still need to process these
    // pages even if src_pfn indicates they are not migrating.
uvm_page_mask_zero(&fault_context.same_devmem_page_mask);
if (!UVM_ID_IS_CPU(processor_id)) {
if (is_atomic_fault(service_context->access_type, region)) {
return hmm_block_atomic_fault_locked(processor_id,
va_block,
va_block_retry,
service_context);
}
        status = hmm_make_resident_cpu(va_block,
                                       service_context->block_context->hmm.vma,
                                       service_context->block_context->hmm.src_pfns,
                                       region,
                                       service_context->access_type,
                                       &fault_context.same_devmem_page_mask);
if (status != NV_OK)
return status;
// If no GPU has a resident copy, we can skip the migrate_vma_*().
// This is necessary if uvm_hmm_must_use_sysmem() returned true.
if (!is_gpu_resident(va_block, region)) {
status = uvm_va_block_service_copy(processor_id,
UVM_ID_CPU,
va_block,
va_block_retry,
service_context);
if (status != NV_OK)
return status;
return uvm_va_block_service_finish(processor_id, va_block, service_context);
}
}
    args->vma = service_context->block_context->hmm.vma;
    args->src = service_context->block_context->hmm.src_pfns + region.first;
    args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args->pgmap_owner = &g_uvm_global;

    if (UVM_ID_IS_CPU(processor_id)) {
        args->fault_page = service_context->cpu_fault.vmf->page;
    }
    else {
        args->flags |= MIGRATE_VMA_SELECT_SYSTEM;
        args->fault_page = NULL;
    }

    ret = migrate_vma_setup_locked(args, va_block);
    UVM_ASSERT(!ret);

// The overall process here is to migrate pages from the GPU to the CPU
// and possibly remote map the GPU to sysmem if accessed_by is set.
// This is safe because we hold the va_block lock across the calls to
// uvm_hmm_devmem_fault_alloc_and_copy(), migrate_vma_pages(),
// uvm_hmm_devmem_fault_finalize_and_map(), and migrate_vma_finalize().
// If uvm_hmm_devmem_fault_alloc_and_copy() needs to drop the va_block
// lock, a sequence number is used to tell if an invalidate() callback
// occurred while not holding the lock. If the sequence number changes,
// all the locks need to be dropped (mm, va_space, va_block) and the whole
// uvm_va_block_service_locked() called again. Otherwise, there were no
// conflicting invalidate callbacks and our snapshots of the CPU page
// tables are accurate and can be used to DMA pages and update GPU page
// tables.
status = uvm_hmm_devmem_fault_alloc_and_copy(&fault_context);
if (status == NV_OK) {
migrate_vma_pages(args);
status = uvm_hmm_devmem_fault_finalize_and_map(&fault_context);
}
migrate_vma_finalize(args);
if (status == NV_WARN_NOTHING_TO_DO)
status = NV_OK;
return status;
}
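
// Record the source system memory pages selected by migrate_vma_setup() in
// the va_block so they can be DMA mapped and copied to the destination.
// Pages that can't be migrated are cleared from page_mask.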
static NV_STATUS dmamap_src_sysmem_pages(uvm_va_block_t *va_block,
struct vm_area_struct *vma,
const unsigned long *src_pfns,
unsigned long *dst_pfns,
uvm_va_block_region_t region,
uvm_page_mask_t *page_mask,
uvm_processor_id_t dest_id,
uvm_service_block_context_t *service_context)
{
uvm_page_index_t page_index;
NV_STATUS status = NV_OK;
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
struct page *src_page;
if (!(src_pfns[page_index] & MIGRATE_PFN_MIGRATE)) {
// HMM currently has some limitations on what pages can be migrated.
// For example, no file backed pages, device private pages owned by
// a different device, device exclusive or swapped out pages.
goto clr_mask;
}
// This is the page that will be copied to the destination GPU.
src_page = migrate_pfn_to_page(src_pfns[page_index]);
if (src_page) {
if (is_device_private_page(src_page)) {
status = gpu_chunk_add(va_block, page_index, src_page);
if (status != NV_OK)
break;
continue;
}
            if (PageSwapCache(src_page)) {
                // TODO: Bug 4050579: Remove this when swap cached pages can be
                // migrated.
                if (service_context) {
                    service_context->block_context->hmm.swap_cached = true;
                    break;
                }

                goto clr_mask;
            }

// If the page is already allocated, it is most likely a mirrored
// page. Check to be sure it matches what we have recorded. The
// page shouldn't be a staging page from a GPU to GPU migration
// or a remote mapped atomic sysmem page because migrate_vma_setup()
// found a normal page and non-mirrored pages are only known
// privately to the UVM driver.
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(hmm_va_block_cpu_page_is_same(va_block, page_index, src_page));
UVM_ASSERT(uvm_processor_mask_test(&va_block->resident, UVM_ID_CPU));
                UVM_ASSERT(uvm_va_block_cpu_is_page_resident_on(va_block, NUMA_NO_NODE, page_index));
}
else {
status = hmm_va_block_cpu_page_populate(va_block, page_index, src_page);
if (status != NV_OK)
goto clr_mask;
// Since there is a CPU resident page, there shouldn't be one
// anywhere else. TODO: Bug 3660922: Need to handle read
// duplication at some point.
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
// migrate_vma_setup() was able to isolate and lock the page;
// therefore, it is CPU resident and not mapped.
                uvm_va_block_cpu_set_resident_page(va_block, page_to_nid(src_page), page_index);
}
// The call to migrate_vma_setup() will have inserted a migration
// PTE so the CPU has no access.
cpu_mapping_clear(va_block, page_index);
}
else {
// It is OK to migrate an empty anonymous page, a zero page will
// be allocated on the GPU. Just be sure to free any pages
// used for GPU to GPU copies. It can't be an evicted page because
// migrate_vma_setup() would have found a source page.
if (uvm_page_mask_test(&va_block->cpu.allocated, page_index)) {
UVM_ASSERT(!uvm_va_block_page_resident_processors_count(va_block, page_index));
                hmm_va_block_cpu_page_unpopulate(va_block, page_index, NULL);
}
}
continue;
clr_mask:
// TODO: Bug 3900774: clean up murky mess of mask clearing.
uvm_page_mask_clear(page_mask, page_index);
        if (service_context)
            clear_service_context_masks(service_context, dest_id, page_index);
    }

    if (uvm_page_mask_empty(page_mask) ||
        (service_context && service_context->block_context->hmm.swap_cached))
        status = NV_WARN_MORE_PROCESSING_REQUIRED;

if (status != NV_OK)
clean_up_non_migrating_pages(va_block, src_pfns, dst_pfns, region, page_mask);
return status;
}
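
// First half of servicing a GPU fault: stage the source pages, allocate the
// GPU destination pages and start the copy, then fill in the destination
// device private PFNs for migrate_vma_pages().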
static NV_STATUS uvm_hmm_gpu_fault_alloc_and_copy(struct vm_area_struct *vma,
uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
{
uvm_processor_id_t processor_id;
uvm_processor_id_t new_residency;
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_service_block_context_t *service_context;
uvm_perf_prefetch_hint_t *prefetch_hint;
const unsigned long *src_pfns;
unsigned long *dst_pfns;
uvm_va_block_region_t region;
uvm_page_mask_t *page_mask;
NV_STATUS status;
processor_id = uvm_hmm_gpu_fault_event->processor_id;
new_residency = uvm_hmm_gpu_fault_event->new_residency;
va_block = uvm_hmm_gpu_fault_event->va_block;
va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
service_context = uvm_hmm_gpu_fault_event->service_context;
region = service_context->region;
prefetch_hint = &service_context->prefetch_hint;
    src_pfns = service_context->block_context->hmm.src_pfns;
    dst_pfns = service_context->block_context->hmm.dst_pfns;

// Build the migration mask.
// Note that thrashing pinned pages are already accounted for in
// service_context->resident_processors.
page_mask = &uvm_hmm_gpu_fault_event->page_mask;
uvm_page_mask_copy(page_mask,
&service_context->per_processor_masks[uvm_id_value(new_residency)].new_residency);
status = dmamap_src_sysmem_pages(va_block,
vma,
src_pfns,
dst_pfns,
region,
page_mask,
new_residency,
service_context);
if (status != NV_OK)
return status;
// Do the alloc and copy but don't update the residency or mapping for the
// new location yet.
status = uvm_va_block_service_copy(processor_id, new_residency, va_block, va_block_retry, service_context);
if (status != NV_OK)
return status;
// Record the destination PFNs of device private struct pages now that
// uvm_va_block_service_copy() has populated the GPU destination pages.
fill_dst_pfns(va_block,
src_pfns,
dst_pfns,
region,
page_mask,
&uvm_hmm_gpu_fault_event->same_devmem_page_mask,
new_residency);
return status;
}
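
// Second half of servicing a GPU fault: called after migrate_vma_pages() to
// finish servicing the pages that actually migrated and to sync the va_block
// page/chunk state before migrate_vma_finalize().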
static NV_STATUS uvm_hmm_gpu_fault_finalize_and_map(uvm_hmm_gpu_fault_event_t *uvm_hmm_gpu_fault_event)
{
uvm_processor_id_t processor_id;
uvm_processor_id_t new_residency;
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_service_block_context_t *service_context;
const unsigned long *src_pfns;
unsigned long *dst_pfns;
uvm_va_block_region_t region;
uvm_page_index_t page_index;
uvm_page_mask_t *page_mask;
    NV_STATUS status, tracker_status;

processor_id = uvm_hmm_gpu_fault_event->processor_id;
new_residency = uvm_hmm_gpu_fault_event->new_residency;
va_block = uvm_hmm_gpu_fault_event->va_block;
va_block_retry = uvm_hmm_gpu_fault_event->va_block_retry;
service_context = uvm_hmm_gpu_fault_event->service_context;
    src_pfns = service_context->block_context->hmm.src_pfns;
    dst_pfns = service_context->block_context->hmm.dst_pfns;

region = service_context->region;
page_mask = &uvm_hmm_gpu_fault_event->page_mask;
// There are a number of reasons why HMM will mark a page as not migrating
// even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
unsigned long src_pfn = src_pfns[page_index];
if (src_pfn & MIGRATE_PFN_MIGRATE)
continue;
// If a device private page isn't migrating and only the GPU page table
// is being updated, continue to process it normally.
if (uvm_page_mask_test(&uvm_hmm_gpu_fault_event->same_devmem_page_mask, page_index))
continue;
// TODO: Bug 3900774: clean up murky mess of mask clearing.
uvm_page_mask_clear(page_mask, page_index);
        clear_service_context_masks(service_context, new_residency, page_index);
    }

if (uvm_page_mask_empty(page_mask))
status = NV_WARN_MORE_PROCESSING_REQUIRED;
else
status = uvm_va_block_service_finish(processor_id, va_block, service_context);
tracker_status = sync_page_and_chunk_state(va_block,
src_pfns,
dst_pfns,
region,
page_mask,
&uvm_hmm_gpu_fault_event->same_devmem_page_mask);

    return status == NV_OK ? tracker_status : status;
}
NV_STATUS uvm_hmm_va_block_service_locked(uvm_processor_id_t processor_id,
uvm_processor_id_t new_residency,
uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_service_block_context_t *service_context)
{
    struct mm_struct *mm = service_context->block_context->mm;
    struct vm_area_struct *vma = service_context->block_context->hmm.vma;
uvm_va_block_region_t region = service_context->region;
uvm_hmm_gpu_fault_event_t uvm_hmm_gpu_fault_event;
    struct migrate_vma *args = &service_context->block_context->hmm.migrate_vma_args;
int ret;
NV_STATUS status = NV_ERR_INVALID_ADDRESS;
if (!mm)
return status;
uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
    uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
uvm_assert_mutex_locked(&va_block->lock);
UVM_ASSERT(vma);
// If the desired destination is the CPU, try to fault in CPU pages.
if (UVM_ID_IS_CPU(new_residency))
return hmm_block_cpu_fault_locked(processor_id, va_block, va_block_retry, service_context);
uvm_hmm_gpu_fault_event.processor_id = processor_id;
uvm_hmm_gpu_fault_event.new_residency = new_residency;
uvm_hmm_gpu_fault_event.va_block = va_block;
uvm_hmm_gpu_fault_event.va_block_retry = va_block_retry;
uvm_hmm_gpu_fault_event.service_context = service_context;
args->vma = vma;
    args->src = service_context->block_context->hmm.src_pfns + region.first;
    args->dst = service_context->block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
args->pgmap_owner = &g_uvm_global;
args->fault_page = NULL;
    ret = migrate_vma_setup_locked(args, va_block);
UVM_ASSERT(!ret);
// The overall process here is to migrate pages from the CPU or GPUs to the
// faulting GPU.
// This is safe because we hold the va_block lock across the calls to
// uvm_hmm_gpu_fault_alloc_and_copy(), migrate_vma_pages(),
// uvm_hmm_gpu_fault_finalize_and_map(), and migrate_vma_finalize().
// If uvm_hmm_gpu_fault_alloc_and_copy() needs to drop the va_block
// lock, a sequence number is used to tell if an invalidate() callback
// occurred while not holding the lock. If the sequence number changes,
// all the locks need to be dropped (mm, va_space, va_block) and the whole
// uvm_va_block_service_locked() called again. Otherwise, there were no
// conflicting invalidate callbacks and our snapshots of the CPU page
// tables are accurate and can be used to DMA pages and update GPU page
// tables. TODO: Bug 3901904: there might be better ways of handling no
// page being migrated.
status = uvm_hmm_gpu_fault_alloc_and_copy(vma, &uvm_hmm_gpu_fault_event);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
migrate_vma_finalize(args);
// migrate_vma_setup() might have not been able to lock/isolate any
// pages because they are swapped out or are device exclusive.
// We do know that none of the pages in the region are zero pages
// since migrate_vma_setup() would have reported that information.
// Try to make it resident in system memory and retry the migration.
        status = hmm_make_resident_cpu(va_block,
                                       service_context->block_context->hmm.vma,
                                       service_context->block_context->hmm.src_pfns,
                                       region,
                                       service_context->access_type,
                                       NULL);
return NV_WARN_MORE_PROCESSING_REQUIRED;
}
if (status == NV_OK) {
migrate_vma_pages(args);
status = uvm_hmm_gpu_fault_finalize_and_map(&uvm_hmm_gpu_fault_event);
}
migrate_vma_finalize(args);
if (status == NV_WARN_NOTHING_TO_DO)
status = NV_OK;
return status;
}
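
// First half of an explicit HMM migration: stage the source pages and start
// the copy to the destination processor without updating residency or
// mappings yet.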
static NV_STATUS uvm_hmm_migrate_alloc_and_copy(struct vm_area_struct *vma,
uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
{
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_va_block_context_t *va_block_context;
const unsigned long *src_pfns;
unsigned long *dst_pfns;
uvm_va_block_region_t region;
uvm_processor_id_t dest_id;
uvm_page_mask_t *page_mask;
NV_STATUS status;
va_block = uvm_hmm_migrate_event->va_block;
va_block_retry = uvm_hmm_migrate_event->va_block_retry;
va_block_context = uvm_hmm_migrate_event->va_block_context;
src_pfns = va_block_context->hmm.src_pfns;
dst_pfns = va_block_context->hmm.dst_pfns;
region = uvm_hmm_migrate_event->region;
dest_id = uvm_hmm_migrate_event->dest_id;
page_mask = &uvm_hmm_migrate_event->page_mask;
uvm_page_mask_init_from_region(page_mask, region, NULL);
    uvm_page_mask_zero(&uvm_hmm_migrate_event->same_devmem_page_mask);

uvm_assert_mutex_locked(&va_block->lock);
if (UVM_ID_IS_CPU(dest_id)) {
status = alloc_and_copy_to_cpu(va_block,
vma,
src_pfns,
dst_pfns,
region,
page_mask,
&uvm_hmm_migrate_event->same_devmem_page_mask,
UVM_ID_INVALID,
NULL);
}
else {
status = dmamap_src_sysmem_pages(va_block,
vma,
src_pfns,
dst_pfns,
region,
page_mask,
dest_id,
NULL);
}
if (status != NV_OK)
return status;
status = uvm_va_block_make_resident_copy(va_block,
va_block_retry,
va_block_context,
dest_id,
region,
page_mask,
NULL,
uvm_hmm_migrate_event->cause);
if (status != NV_OK)
return status;
if (!UVM_ID_IS_CPU(dest_id)) {
// Record the destination PFNs of device private struct pages now that
// uvm_va_block_make_resident_copy() has populated the GPU destination
// pages.
fill_dst_pfns(va_block,
src_pfns,
dst_pfns,
region,
page_mask,
&uvm_hmm_migrate_event->same_devmem_page_mask,
dest_id);
}
return status;
}
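
// Second half of an explicit HMM migration: called after migrate_vma_pages()
// or migrate_device_pages() to finish the residency update for the pages that
// actually migrated and to sync the va_block page/chunk state.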
static NV_STATUS uvm_hmm_migrate_finalize(uvm_hmm_migrate_event_t *uvm_hmm_migrate_event)
{
uvm_va_block_t *va_block;
uvm_va_block_retry_t *va_block_retry;
uvm_va_block_context_t *va_block_context;
uvm_va_block_region_t region;
uvm_processor_id_t dest_id;
uvm_page_index_t page_index;
uvm_page_mask_t *page_mask;
const unsigned long *src_pfns;
unsigned long *dst_pfns;
va_block = uvm_hmm_migrate_event->va_block;
va_block_retry = uvm_hmm_migrate_event->va_block_retry;
va_block_context = uvm_hmm_migrate_event->va_block_context;
region = uvm_hmm_migrate_event->region;
dest_id = uvm_hmm_migrate_event->dest_id;
page_mask = &uvm_hmm_migrate_event->page_mask;
src_pfns = va_block_context->hmm.src_pfns;
dst_pfns = va_block_context->hmm.dst_pfns;
uvm_assert_mutex_locked(&va_block->lock);
// There are a number of reasons why HMM will mark a page as not migrating
// even if we set a valid entry in dst_pfns[]. Mark these pages accordingly.
for_each_va_block_page_in_region_mask(page_index, page_mask, region) {
unsigned long src_pfn = src_pfns[page_index];
if (src_pfn & MIGRATE_PFN_MIGRATE)
continue;
// If a device private page isn't migrating and only the GPU page table
// is being updated, continue to process it normally.
if (uvm_page_mask_test(&uvm_hmm_migrate_event->same_devmem_page_mask, page_index))
continue;
uvm_page_mask_clear(page_mask, page_index);
}
uvm_va_block_make_resident_finish(va_block, va_block_context, region, page_mask);

    return sync_page_and_chunk_state(va_block,
                                     src_pfns,
                                     dst_pfns,
                                     region,
                                     page_mask,
                                     &uvm_hmm_migrate_event->same_devmem_page_mask);
}
// Note that migrate_vma_*() doesn't handle asynchronous migrations so the
// migration flag UVM_MIGRATE_FLAG_SKIP_CPU_MAP doesn't have an effect.
// TODO: Bug 3900785: investigate ways to implement async migration.
NV_STATUS uvm_hmm_va_block_migrate_locked(uvm_va_block_t *va_block,
uvm_va_block_retry_t *va_block_retry,
uvm_va_block_context_t *va_block_context,
uvm_processor_id_t dest_id,
uvm_va_block_region_t region,
uvm_make_resident_cause_t cause)
{
uvm_hmm_migrate_event_t uvm_hmm_migrate_event;
struct vm_area_struct *vma = va_block_context->hmm.vma;
NvU64 start;
NvU64 end;
struct migrate_vma *args = &va_block_context->hmm.migrate_vma_args;
NV_STATUS status;
int ret;
UVM_ASSERT(vma);
UVM_ASSERT(va_block_context->mm == vma->vm_mm);
uvm_assert_mmap_lock_locked(va_block_context->mm);
    uvm_assert_rwsem_locked(&va_block->hmm.va_space->lock);
    uvm_assert_mutex_locked(&va_block->hmm.migrate_lock);
uvm_assert_mutex_locked(&va_block->lock);
start = uvm_va_block_region_start(va_block, region);
end = uvm_va_block_region_end(va_block, region);
UVM_ASSERT(vma->vm_start <= start && end < vma->vm_end);
uvm_hmm_migrate_event.va_block = va_block;
uvm_hmm_migrate_event.va_block_retry = va_block_retry;
uvm_hmm_migrate_event.va_block_context = va_block_context;
uvm_hmm_migrate_event.region = region;
uvm_hmm_migrate_event.dest_id = dest_id;
uvm_hmm_migrate_event.cause = cause;
args->vma = vma;
args->src = va_block_context->hmm.src_pfns + region.first;
args->dst = va_block_context->hmm.dst_pfns + region.first;
args->start = uvm_va_block_region_start(va_block, region);
args->end = uvm_va_block_region_end(va_block, region) + 1;
args->flags = UVM_ID_IS_CPU(dest_id) ? MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
MIGRATE_VMA_SELECT_DEVICE_PRIVATE | MIGRATE_VMA_SELECT_SYSTEM;
args->pgmap_owner = &g_uvm_global;
args->fault_page = NULL;
    // Note that migrate_vma_setup() doesn't handle file backed or VM_SPECIAL
    // VMAs so if UvmMigrate() tries to migrate such a region, -EINVAL will
    // be returned and we will only try to make the pages CPU resident.
ret = migrate_vma_setup_locked(args, va_block);
if (ret)
return hmm_make_resident_cpu(va_block,
vma,
va_block_context->hmm.src_pfns,
region,
NULL,
NULL);
// The overall process here is to migrate pages from the CPU or GPUs to the
// destination processor. Note that block_migrate_add_mappings() handles
// updating GPU mappings after the migration.
// This is safe because we hold the va_block lock across the calls to
// uvm_hmm_migrate_alloc_and_copy(), migrate_vma_pages(),
// uvm_hmm_migrate_finalize(), migrate_vma_finalize() and
// block_migrate_add_mappings().
// If uvm_hmm_migrate_alloc_and_copy() needs to drop the va_block
// lock, a sequence number is used to tell if an invalidate() callback
// occurred while not holding the lock. If the sequence number changes,
// all the locks need to be dropped (mm, va_space, va_block) and the whole
// uvm_hmm_va_block_migrate_locked() called again. Otherwise, there were no
// conflicting invalidate callbacks and our snapshots of the CPU page
// tables are accurate and can be used to DMA pages and update GPU page
// tables.
status = uvm_hmm_migrate_alloc_and_copy(vma, &uvm_hmm_migrate_event);
if (status == NV_WARN_MORE_PROCESSING_REQUIRED) {
uvm_processor_id_t id;
uvm_page_mask_t *page_mask;
migrate_vma_finalize(args);
// The CPU pages tables might contain only device private pages or
// the migrate_vma_setup() might have not been able to lock/isolate
// any pages because they are swapped out, or on another device.
// We do know that none of the pages in the region are zero pages
// since migrate_vma_setup() would have reported that information.
// Collect all the pages that need to be faulted in and made CPU
// resident, then do the hmm_range_fault() and retry.
page_mask = &va_block_context->caller_page_mask;
uvm_page_mask_init_from_region(page_mask, region, NULL);
for_each_id_in_mask(id, &va_block->resident) {
            if (!uvm_page_mask_andnot(page_mask, page_mask, uvm_va_block_resident_mask_get(va_block, id, NUMA_NO_NODE)))
return NV_OK;
}
        return hmm_make_resident_cpu(va_block,
                                     vma,
                                     va_block_context->hmm.src_pfns,
                                     region,
                                     NULL,
                                     NULL);
}
if (status == NV_OK) {
migrate_vma_pages(args);
status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
}
migrate_vma_finalize(args);
if (status == NV_WARN_NOTHING_TO_DO)
status = NV_OK;
return status;
}
NV_STATUS uvm_hmm_migrate_ranges(uvm_va_space_t *va_space,
uvm_va_block_context_t *va_block_context,
NvU64 base,
NvU64 length,
uvm_processor_id_t dest_id,
uvm_migrate_mode_t mode,
uvm_tracker_t *out_tracker)
{
struct mm_struct *mm;
uvm_va_block_t *va_block;
uvm_va_block_retry_t va_block_retry;
NvU64 addr, end, last_address;
NV_STATUS status = NV_OK;
if (!uvm_hmm_is_enabled(va_space))
return NV_ERR_INVALID_ADDRESS;
mm = va_block_context->mm;
UVM_ASSERT(mm == va_space->va_space_mm.mm);
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked(&va_space->lock);
last_address = base + length - 1;
for (addr = base; addr < last_address; addr = end + 1) {
struct vm_area_struct *vma;
        status = hmm_va_block_find_create(va_space, addr, false, &va_block_context->hmm.vma, &va_block);
if (status != NV_OK)
return status;
end = va_block->end;
if (end > last_address)
end = last_address;
vma = va_block_context->hmm.vma;
if (end > vma->vm_end - 1)
end = vma->vm_end - 1;
status = hmm_migrate_range(va_block,
&va_block_retry,
va_block_context,
dest_id,
addr,
end,
mode,
out_tracker);
if (status != NV_OK)
break;
}
return status;
}
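
// Prepare to evict a GPU chunk by filling in the corresponding src_pfns[]
// entries with migrate_device_range(). HMM invalidation callbacks for this
// va_block are ignored around the call since the va_block lock is held.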
NV_STATUS uvm_hmm_va_block_evict_chunk_prep(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
uvm_gpu_chunk_t *gpu_chunk,
uvm_va_block_region_t chunk_region)
{
uvm_thread_context_t *uvm_context = uvm_thread_context();
unsigned long *src_pfns = va_block_context->hmm.src_pfns;
uvm_gpu_t *gpu = uvm_gpu_chunk_get_gpu(gpu_chunk);
unsigned long pfn = uvm_pmm_gpu_devmem_get_pfn(&gpu->pmm, gpu_chunk);
uvm_page_index_t page_index = chunk_region.first;
int ret;
uvm_assert_mutex_locked(&va_block->lock);
// TODO: Bug 3368756: add support for large GPU pages.
UVM_ASSERT(uvm_va_block_region_num_pages(chunk_region) == 1);
uvm_context->ignore_hmm_invalidate_va_block = va_block;
ret = migrate_device_range(src_pfns + page_index, pfn, uvm_va_block_region_num_pages(chunk_region));
uvm_context->ignore_hmm_invalidate_va_block = NULL;
if (ret)
return errno_to_nv_status(ret);
return NV_OK;
}
// Note that the caller must initialize va_block_context->hmm.src_pfns by
// calling uvm_hmm_va_block_evict_chunk_prep() before calling this.
static NV_STATUS hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region,
uvm_make_resident_cause_t cause,
bool *out_accessed_by_set)
{
NvU64 start = uvm_va_block_region_start(va_block, region);
NvU64 end = uvm_va_block_region_end(va_block, region);
unsigned long *src_pfns = va_block_context->hmm.src_pfns;
unsigned long *dst_pfns = va_block_context->hmm.dst_pfns;
uvm_hmm_migrate_event_t uvm_hmm_migrate_event = {
.va_block = va_block,
.va_block_retry = NULL,
.va_block_context = va_block_context,
.region = region,
.dest_id = UVM_ID_CPU,
.cause = cause,
};
uvm_page_mask_t *page_mask = &uvm_hmm_migrate_event.page_mask;
const uvm_va_policy_t *policy;
uvm_va_policy_node_t *node;
    uvm_page_mask_t *cpu_resident_mask = uvm_va_block_resident_mask_get(va_block, UVM_ID_CPU, NUMA_NO_NODE);
unsigned long npages;
NV_STATUS status;
uvm_assert_mutex_locked(&va_block->lock);
if (out_accessed_by_set)
*out_accessed_by_set = false;
// Note that there is no VMA available when evicting HMM pages.
va_block_context->hmm.vma = NULL;
uvm_page_mask_copy(page_mask, pages_to_evict);
uvm_for_each_va_policy_in(policy, va_block, start, end, node, region) {
npages = uvm_va_block_region_num_pages(region);
if (out_accessed_by_set && uvm_processor_mask_get_count(&policy->accessed_by) > 0)
*out_accessed_by_set = true;
// Pages resident on the GPU should not have a resident page in system
// memory.
// TODO: Bug 3660922: Need to handle read duplication at some point.
        UVM_ASSERT(uvm_page_mask_region_empty(cpu_resident_mask, region));
status = alloc_and_copy_to_cpu(va_block,
NULL,
src_pfns,
dst_pfns,
region,
page_mask,
NULL,
UVM_ID_INVALID,
NULL);
if (status != NV_OK)
goto err;
status = uvm_va_block_make_resident_copy(va_block,
NULL,
va_block_context,
UVM_ID_CPU,
region,
page_mask,
NULL,
cause);
if (status != NV_OK)
goto err;
migrate_device_pages(src_pfns + region.first, dst_pfns + region.first, npages);
uvm_hmm_migrate_event.region = region;
status = uvm_hmm_migrate_finalize(&uvm_hmm_migrate_event);
if (status != NV_OK)
goto err;
migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
}
return NV_OK;
err:
migrate_device_finalize(src_pfns + region.first, dst_pfns + region.first, npages);
return status;
}
NV_STATUS uvm_hmm_va_block_evict_chunks(uvm_va_block_t *va_block,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region,
bool *out_accessed_by_set)
{
return hmm_va_block_evict_chunks(va_block,
va_block_context,
pages_to_evict,
region,
UVM_MAKE_RESIDENT_CAUSE_EVICTION,
out_accessed_by_set);
}
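
// Evict the given pages from the GPU to system memory: gather the device
// private PFNs of the GPU chunks into src_pfns[] and then evict them like any
// other chunk eviction.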
NV_STATUS uvm_hmm_va_block_evict_pages_from_gpu(uvm_va_block_t *va_block,
uvm_gpu_t *gpu,
uvm_va_block_context_t *va_block_context,
const uvm_page_mask_t *pages_to_evict,
uvm_va_block_region_t region)
{
unsigned long *src_pfns = va_block_context->hmm.src_pfns;
uvm_va_block_gpu_state_t *gpu_state;
uvm_page_index_t page_index;
uvm_gpu_chunk_t *gpu_chunk;
NV_STATUS status;
uvm_assert_mutex_locked(&va_block->lock);
gpu_state = uvm_va_block_gpu_state_get(va_block, gpu->id);
UVM_ASSERT(gpu_state);
UVM_ASSERT(gpu_state->chunks);
// Fill in the src_pfns[] with the ZONE_DEVICE private PFNs of the GPU.
memset(src_pfns, 0, sizeof(va_block_context->hmm.src_pfns));
// TODO: Bug 3368756: add support for large GPU pages.
for_each_va_block_page_in_region_mask(page_index, pages_to_evict, region) {
gpu_chunk = uvm_va_block_lookup_gpu_chunk(va_block,
gpu,
uvm_va_block_cpu_page_address(va_block, page_index));
status = uvm_hmm_va_block_evict_chunk_prep(va_block,
va_block_context,
gpu_chunk,
uvm_va_block_region_for_page(page_index));
if (status != NV_OK)
return status;
}
return hmm_va_block_evict_chunks(va_block,
va_block_context,
pages_to_evict,
region,
UVM_MAKE_RESIDENT_CAUSE_API_MIGRATE,
NULL);
}
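
// Handle a CPU fault on a device private page when there is no va_block to
// service it through: migrate the single faulting page back to system memory
// with a standalone migrate_vma.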
NV_STATUS uvm_hmm_remote_cpu_fault(struct vm_fault *vmf)
{
NV_STATUS status = NV_OK;
unsigned long src_pfn;
unsigned long dst_pfn;
struct migrate_vma args;
struct page *src_page = vmf->page;
uvm_tracker_t tracker = UVM_TRACKER_INIT();
    int ret;

args.vma = vmf->vma;
args.src = &src_pfn;
args.dst = &dst_pfn;
args.start = nv_page_fault_va(vmf);
args.end = args.start + PAGE_SIZE;
args.pgmap_owner = &g_uvm_global;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
args.fault_page = src_page;
// We don't call migrate_vma_setup_locked() here because we don't
// have a va_block and don't want to ignore invalidations.
ret = migrate_vma_setup(&args);
UVM_ASSERT(!ret);

    if (src_pfn & MIGRATE_PFN_MIGRATE) {
        struct page *dst_page;

        dst_page = alloc_page(GFP_HIGHUSER_MOVABLE);
if (!dst_page) {
status = NV_ERR_NO_MEMORY;
goto out;
}
lock_page(dst_page);
dst_pfn = migrate_pfn(page_to_pfn(dst_page));
        status = uvm_hmm_copy_devmem_page(dst_page, src_page, &tracker);
        if (status == NV_OK)
            status = uvm_tracker_wait_deinit(&tracker);
    }

    migrate_vma_pages(&args);

out:
    migrate_vma_finalize(&args);
return status;
}
// The routines below are all for UVM-HMM tests.
NV_STATUS uvm_hmm_va_block_range_bounds(uvm_va_space_t *va_space,
struct mm_struct *mm,
NvU64 lookup_address,
NvU64 *startp,
NvU64 *endp,
UVM_TEST_VA_RESIDENCY_INFO_PARAMS *params)
{
struct vm_area_struct *vma;
NvU64 start;
NvU64 end;
if (!uvm_hmm_is_enabled(va_space) || !mm)
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked(&va_space->lock);
// The VMA might have changed while not holding mmap_lock so check it.
vma = find_vma(mm, lookup_address);
if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
return NV_ERR_INVALID_ADDRESS;
// Since managed VA ranges don't cover more than one VMA, return only the
// intersecting range of the VA block and VMA.
start = UVM_VA_BLOCK_ALIGN_DOWN(lookup_address);
end = start + UVM_VA_BLOCK_SIZE - 1;
if (start < vma->vm_start)
start = vma->vm_start;
if (end > vma->vm_end - 1)
end = vma->vm_end - 1;
*startp = start;
*endp = end;
if (params) {
uvm_va_space_processor_uuid(va_space, &params->resident_on[0], UVM_ID_CPU);
params->resident_physical_size[0] = PAGE_SIZE;
params->resident_on_count = 1;
uvm_va_space_processor_uuid(va_space, &params->mapped_on[0], UVM_ID_CPU);
params->mapping_type[0] = (vma->vm_flags & VM_WRITE) ?
UVM_PROT_READ_WRITE_ATOMIC : UVM_PROT_READ_ONLY;
params->page_size[0] = PAGE_SIZE;
params->mapped_on_count = 1;
uvm_va_space_processor_uuid(va_space, &params->populated_on[0], UVM_ID_CPU);
params->populated_on_count = 1;
}
return NV_OK;
}
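
// Refresh the va_block CPU state for the page containing lookup_address from
// a hmm_range_fault() snapshot, optionally faulting the page in first.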
NV_STATUS uvm_hmm_va_block_update_residency_info(uvm_va_block_t *va_block,
struct mm_struct *mm,
NvU64 lookup_address,
bool populate)
{
uvm_va_space_t *va_space = va_block->hmm.va_space;
struct vm_area_struct *vma;
struct hmm_range range;
uvm_va_block_region_t region;
unsigned long pfn;
NvU64 end;
int ret;
NV_STATUS status;
if (!uvm_hmm_is_enabled(va_space) || !mm)
return NV_ERR_INVALID_ADDRESS;
uvm_assert_mmap_lock_locked(mm);
uvm_assert_rwsem_locked(&va_space->lock);
// The VMA might have changed while not holding mmap_lock so check it.
vma = find_vma(mm, lookup_address);
if (!uvm_hmm_vma_is_valid(vma, lookup_address, false))
return NV_ERR_INVALID_ADDRESS;
end = lookup_address + PAGE_SIZE;
region = uvm_va_block_region_from_start_end(va_block, lookup_address, end - 1);
range.notifier = &va_block->hmm.notifier;
range.start = lookup_address;
range.end = end;
range.hmm_pfns = &pfn;
range.default_flags = 0;
range.pfn_flags_mask = 0;
range.dev_private_owner = &g_uvm_global;
if (populate) {
range.default_flags = HMM_PFN_REQ_FAULT;
if (vma->vm_flags & VM_WRITE)
range.default_flags |= HMM_PFN_REQ_WRITE;
}
1 year ago
uvm_hmm_migrate_begin_wait(va_block);
2 years ago
while (true) {
1 year ago
range.notifier_seq = mmu_interval_read_begin(range.notifier);
2 years ago
ret = hmm_range_fault(&range);
if (ret == -EBUSY)
continue;
1 year ago
if (ret) {
uvm_hmm_migrate_finish(va_block);
2 years ago
return errno_to_nv_status(ret);
1 year ago
}
2 years ago
uvm_mutex_lock(&va_block->lock);
1 year ago
if (!mmu_interval_read_retry(range.notifier, range.notifier_seq))
2 years ago
break;
uvm_mutex_unlock(&va_block->lock);
}

    // Update the va_block CPU state based on the snapshot.
    // Note that we have to adjust the pfns address since it will be indexed
    // by region.first.
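    // (The region covers exactly one page, so populate_region() only reads
    // entry [region.first] of the adjusted pointer, which points back at the
    // local pfn above.)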
    status = populate_region(va_block, &pfn - region.first, region, NULL);

    uvm_mutex_unlock(&va_block->lock);
    uvm_hmm_migrate_finish(va_block);

    return status;
}

NV_STATUS uvm_test_split_invalidate_delay(UVM_TEST_SPLIT_INVALIDATE_DELAY_PARAMS *params, struct file *filp)
{
    uvm_va_space_t *va_space = uvm_va_space_get(filp);
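
    // Test-only knob: the stored delay (in microseconds) is intended to be
    // picked up by the HMM invalidate path so tests can widen race windows
    // around va_block splits.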
    atomic64_set(&va_space->test.split_invalidate_delay_us, params->delay_us);

    return NV_OK;
}

NV_STATUS uvm_hmm_va_range_info(uvm_va_space_t *va_space,
                                struct mm_struct *mm,
                                UVM_TEST_VA_RANGE_INFO_PARAMS *params)
{
    uvm_range_tree_node_t *tree_node;
    const uvm_va_policy_node_t *node;
    struct vm_area_struct *vma;
    uvm_va_block_t *va_block;

    if (!mm || !uvm_hmm_is_enabled(va_space))
        return NV_ERR_INVALID_ADDRESS;

    uvm_assert_mmap_lock_locked(mm);
    uvm_assert_rwsem_locked(&va_space->lock);

    params->type = UVM_TEST_VA_RANGE_TYPE_MANAGED;
    params->managed.subtype = UVM_TEST_RANGE_SUBTYPE_HMM;
    params->va_range_start = 0;
    params->va_range_end = ULONG_MAX;
    params->read_duplication = UVM_TEST_READ_DUPLICATION_UNSET;
    memset(&params->preferred_location, 0, sizeof(params->preferred_location));
    params->accessed_by_count = 0;
    params->managed.vma_start = 0;
    params->managed.vma_end = 0;
    params->managed.is_zombie = NV_FALSE;
    params->managed.owned_by_calling_process = (mm == current->mm ? NV_TRUE : NV_FALSE);

    vma = find_vma(mm, params->lookup_address);
    if (!uvm_hmm_vma_is_valid(vma, params->lookup_address, false))
        return NV_ERR_INVALID_ADDRESS;

    params->va_range_start = vma->vm_start;
    params->va_range_end = vma->vm_end - 1;
    params->managed.vma_start = vma->vm_start;
    params->managed.vma_end = vma->vm_end - 1;

    uvm_mutex_lock(&va_space->hmm.blocks_lock);
    tree_node = uvm_range_tree_find(&va_space->hmm.blocks, params->lookup_address);
    if (!tree_node) {
        UVM_ASSERT(uvm_range_tree_find_hole_in(&va_space->hmm.blocks, params->lookup_address,
                                               &params->va_range_start, &params->va_range_end) == NV_OK);
        uvm_mutex_unlock(&va_space->hmm.blocks_lock);
        return NV_OK;
    }

    uvm_mutex_unlock(&va_space->hmm.blocks_lock);

    va_block = hmm_va_block_from_node(tree_node);
    uvm_mutex_lock(&va_block->lock);

    params->va_range_start = va_block->start;
    params->va_range_end = va_block->end;
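
    // Narrow the reported range to the policy node covering lookup_address,
    // or, if no explicit policy has been set there, to the surrounding hole
    // in the block's policy tree.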
    node = uvm_va_policy_node_find(va_block, params->lookup_address);
    if (node) {
        uvm_processor_id_t processor_id;

        if (params->va_range_start < node->node.start)
            params->va_range_start = node->node.start;
        if (params->va_range_end > node->node.end)
            params->va_range_end = node->node.end;

        params->read_duplication = node->policy.read_duplication;

        if (!UVM_ID_IS_INVALID(node->policy.preferred_location))
            uvm_va_space_processor_uuid(va_space, &params->preferred_location, node->policy.preferred_location);

        for_each_id_in_mask(processor_id, &node->policy.accessed_by)
            uvm_va_space_processor_uuid(va_space, &params->accessed_by[params->accessed_by_count++], processor_id);
    }
    else {
        uvm_range_tree_find_hole_in(&va_block->hmm.va_policy_tree, params->lookup_address,
                                    &params->va_range_start, &params->va_range_end);
    }

    uvm_mutex_unlock(&va_block->lock);

    return NV_OK;
}

// TODO: Bug 3660968: Remove this hack as soon as HMM migration is implemented
// for VMAs other than anonymous private memory.
bool uvm_hmm_must_use_sysmem(uvm_va_block_t *va_block,
                             uvm_va_block_context_t *va_block_context)
{
    struct vm_area_struct *vma = va_block_context->hmm.vma;

    uvm_assert_mutex_locked(&va_block->lock);

    if (!uvm_va_block_is_hmm(va_block))
        return false;

    UVM_ASSERT(vma);
    UVM_ASSERT(va_block_context->mm == vma->vm_mm);
    uvm_assert_mmap_lock_locked(va_block_context->mm);

    // TODO: Bug 4050579: Remove this when swap cached pages can be migrated.
    if (va_block_context->hmm.swap_cached)
        return true;

    // migrate_vma_setup() can't migrate VM_SPECIAL so we have to force GPU
    // remote mapping.
    // TODO: Bug 3660968: add support for file-backed migrations.
    // TODO: Bug 3368756: add support for transparent huge page migrations.
    return !vma_is_anonymous(vma) ||
           (vma->vm_flags & VM_SPECIAL) ||
           vma_is_dax(vma) ||
           is_vm_hugetlb_page(vma);
}
#endif // UVM_IS_CONFIG_HMM()