Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions .github/workflows/style.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ on:
pull_request:

jobs:
cpplint:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@master
- uses: reviewdog/action-cpplint@master
- uses: actions/checkout@v3
- name: Run clang-format style check for C/C++ programs.
uses: jidicula/clang-format-action@v4.18.0
with:
github_token: ${{ secrets.github_token }}
args: --linelength=120
clang-format-version: '18'
check-path: 'src'
3 changes: 3 additions & 0 deletions src/.clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
BasedOnStyle: Google
IndentWidth: 4
ColumnLimit: 100
127 changes: 61 additions & 66 deletions src/allocator/allocator.c
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,44 +1,42 @@
#include "allocator.h"
#include "include/log_utils.h"

#include "include/libcuda_hook.h"
#include "include/log_utils.h"
#include "multiprocess/multiprocess_memory_limit.h"


size_t BITSIZE = 512;
size_t IPCSIZE = 2097152;
size_t OVERSIZE = 134217728;
//int pidfound;
// int pidfound;

region_list *r_list;
allocated_list *device_overallocated;
allocated_list *device_allocasync;

#define ALIGN 2097152
#define ALIGN 2097152
#define MULTI_PARAM 1

#define CHUNK_SIZE (OVERSIZE/BITSIZE)
#define __CHUNK_SIZE__ CHUNK_SIZE
#define CHUNK_SIZE (OVERSIZE / BITSIZE)
#define __CHUNK_SIZE__ CHUNK_SIZE

extern size_t initial_offset;
extern CUresult
cuMemoryAllocate(CUdeviceptr* dptr, size_t bytesize, void* data);
extern CUresult cuMemoryAllocate(CUdeviceptr *dptr, size_t bytesize, void *data);
extern CUresult cuMemoryFree(CUdeviceptr dptr);

pthread_once_t allocator_allocate_flag = PTHREAD_ONCE_INIT;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Round `size` up to the nearest multiple of `unit`.
 *
 * size: value to round.
 * unit: granularity; any positive value is accepted, not only powers of two.
 *
 * Returns `size` unchanged when it is already a multiple of `unit`, or when
 * `unit` is 0 (there is no meaningful multiple and dividing would trap).
 * The result wraps around if the rounded value does not fit in size_t.
 */
size_t round_up(size_t size, size_t unit) {
    if (unit == 0) return size;

    /* A modulo test (rather than the `size & (unit - 1)` mask) stays correct
     * for units that are not powers of two. */
    size_t remainder = size % unit;
    if (remainder == 0) return size;
    return size + (unit - remainder);
}

int oom_check(const int dev, size_t addon) {
CUdevice d;
if (dev==-1)
if (dev == -1)
cuCtxGetDevice(&d);
else
d=dev;
d = dev;
uint64_t limit = get_current_device_memory_limit(d);
size_t _usage = get_gpu_memory_usage(d);

Expand All @@ -47,12 +45,11 @@ int oom_check(const int dev, size_t addon) {
}

size_t new_allocated = _usage + addon;
LOG_INFO("_usage=%lu limit=%lu new_allocated=%lu",_usage,limit,new_allocated);
LOG_INFO("_usage=%lu limit=%lu new_allocated=%lu", _usage, limit, new_allocated);
if (new_allocated > limit) {
LOG_ERROR("Device %d OOM %lu / %lu", d, new_allocated, limit);

if (clear_proc_slot_nolock(1) > 0)
return oom_check(dev,addon);
if (clear_proc_slot_nolock(1) > 0) return oom_check(dev, addon);
return 1;
}
return 0;
Expand All @@ -61,27 +58,27 @@ int oom_check(const int dev, size_t addon) {
/*
 * Debug helper: logs every (address, length) pair held in
 * device_overallocated, the sum of those lengths, and the value reported by
 * get_current_device_memory_usage(0).
 *
 * Always returns 0.
 */
CUresult view_vgpu_allocator() {
    size_t sum = 0;
    allocated_list_entry *node;

    LOG_INFO("[view1]:overallocated:");
    node = device_overallocated->head;
    while (node != NULL) {
        LOG_INFO("(%p %lu)\t", (void *)node->entry->address, node->entry->length);
        sum += node->entry->length;
        node = node->next;
    }
    LOG_INFO("total=%lu", sum);

    /* The usage query is issued for device index 0 only. */
    size_t usage = get_current_device_memory_usage(0);
    LOG_INFO("current_device_memory_usage:%lu", usage);
    return 0;
}

CUresult get_listsize(allocated_list *al, size_t *size) {
if (al->length == 0){
if (al->length == 0) {
*size = 0;
return CUDA_SUCCESS;
}
size_t count=0;
size_t count = 0;
allocated_list_entry *val;
for (val=al->head;val!=NULL;val=val->next){
count+=val->entry->length;
for (val = al->head; val != NULL; val = val->next) {
count += val->entry->length;
}
*size = count;
return CUDA_SUCCESS;
Expand All @@ -92,10 +89,10 @@ void allocator_init() {

device_overallocated = malloc(sizeof(allocated_list));
LIST_INIT(device_overallocated);
device_allocasync=malloc(sizeof(allocated_list));
device_allocasync = malloc(sizeof(allocated_list));
LIST_INIT(device_allocasync);

pthread_mutex_init(&mutex,NULL);
pthread_mutex_init(&mutex, NULL);
}

int add_chunk(CUdeviceptr *address, size_t size) {
Expand All @@ -105,8 +102,7 @@ int add_chunk(CUdeviceptr *address, size_t size) {
cuCtxGetDevice(&dev);

/* OOM pre-check without lock */
if (oom_check(dev, size))
return CUDA_ERROR_OUT_OF_MEMORY;
if (oom_check(dev, size)) return CUDA_ERROR_OUT_OF_MEMORY;

/* GPU allocation outside lock — the expensive part */
if (size <= IPCSIZE) {
Expand Down Expand Up @@ -141,17 +137,17 @@ int add_chunk(CUdeviceptr *address, size_t size) {

int add_chunk_only(CUdeviceptr address, size_t size, CUdevice dev) {
pthread_mutex_lock(&mutex);
size_t addr=0;
size_t addr = 0;
size_t allocsize;
if (oom_check(dev,size)){
if (oom_check(dev, size)) {
pthread_mutex_unlock(&mutex);
return -1;
}
allocated_list_entry *e;
INIT_ALLOCATED_LIST_ENTRY(e, addr, size, dev);
LIST_ADD(device_overallocated,e);
//uint64_t t_size;
e->entry->address=address;
LIST_ADD(device_overallocated, e);
// uint64_t t_size;
e->entry->address = address;
allocsize = size;
add_gpu_device_memory_usage(getpid(), dev, allocsize, 2);
pthread_mutex_unlock(&mutex);
Expand All @@ -161,8 +157,9 @@ int add_chunk_only(CUdeviceptr address, size_t size, CUdevice dev) {
int check_memory_type(CUdeviceptr address) {
allocated_list_entry *cursor;
cursor = device_overallocated->head;
for (cursor=device_overallocated->head;cursor!=NULL;cursor=cursor->next){
if ((cursor->entry->address <= address) && (cursor->entry->address+cursor->entry->length>=address))
for (cursor = device_overallocated->head; cursor != NULL; cursor = cursor->next) {
if ((cursor->entry->address <= address) &&
(cursor->entry->address + cursor->entry->length >= address))
return CU_MEMORYTYPE_DEVICE;
}
return CU_MEMORYTYPE_HOST;
Expand Down Expand Up @@ -218,30 +215,25 @@ int remove_chunk_only(CUdeviceptr dptr) {
return -1;
}

/*
 * Allocation entry point: forwards to add_chunk() and hands its status code
 * back to the caller unchanged.
 */
int allocate_raw(CUdeviceptr *dptr, size_t size) {
    const int status = add_chunk(dptr, size);
    return status;
}

/*
 * Release entry point: asks remove_chunk() to drop `dptr` from the
 * device_overallocated list and returns its status code unchanged.
 */
int free_raw(CUdeviceptr dptr) {
    const int status = remove_chunk(device_overallocated, dptr);
    return status;
}

int remove_chunk_async(
allocated_list *a_list, CUdeviceptr dptr, CUstream hStream) {
int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStream) {
size_t t_size;
if (a_list->length == 0) {
return -1;
}
allocated_list_entry *val;
for (val = a_list->head; val != NULL; val = val->next) {
if (val->entry->address == dptr) {
t_size=val->entry->length;
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemFreeAsync,dptr,hStream);
LIST_REMOVE(a_list,val);
a_list->limit-=t_size;
t_size = val->entry->length;
CUDA_OVERRIDE_CALL(cuda_library_entry, cuMemFreeAsync, dptr, hStream);
LIST_REMOVE(a_list, val);
a_list->limit -= t_size;
CUdevice dev;
cuCtxGetDevice(&dev);
rm_gpu_device_memory_usage(getpid(),dev,t_size,2);
rm_gpu_device_memory_usage(getpid(), dev, t_size, 2);
return 0;
}
}
Expand All @@ -256,53 +248,56 @@ int free_raw_async(CUdeviceptr dptr, CUstream hStream) {
}

/*
 * Stream-ordered allocation of `size` bytes on the current context's device,
 * recorded in the device_allocasync list.
 *
 * address: receives the device pointer; written only when the call succeeds.
 * size:    number of bytes requested.
 * hStream: stream passed to cuMemAllocAsync (and to cuMemFreeAsync on the
 *          failure path).
 *
 * Returns 0 on success, -1 when oom_check() rejects the request, otherwise
 * the CUresult of the driver call that failed.
 *
 * Accounting: the amount charged through add_gpu_device_memory_usage() is
 * derived from the default pool's CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH value
 * compared with device_allocasync->limit, capped at `size`.
 *
 * This function takes no lock itself; allocate_async_raw() calls it with the
 * file-level mutex held.
 */
int add_chunk_async(CUdeviceptr *address, size_t size, CUstream hStream) {
    size_t addr = 0;
    size_t allocsize;
    size_t poollimit;
    CUmemoryPool pool;
    CUresult res = CUDA_SUCCESS;
    CUdevice dev;
    cuCtxGetDevice(&dev);
    if (oom_check(dev, size)) return -1;

    allocated_list_entry *e;
    INIT_ALLOCATED_LIST_ENTRY(e, addr, size, dev);
    res =
        CUDA_OVERRIDE_CALL(cuda_library_entry, cuMemAllocAsync, &e->entry->address, size, hStream);
    if (res != CUDA_SUCCESS) {
        LOG_ERROR("cuMemAllocAsync failed res=%d", res);
        /* NOTE(review): `e` is not released on the error paths of this
         * function. INIT_ALLOCATED_LIST_ENTRY is defined outside this file
         * view, so the matching cleanup cannot be written safely here —
         * confirm against allocator.h and free it. */
        return res;
    }

    res = CUDA_OVERRIDE_CALL(cuda_library_entry, cuDeviceGetMemPool, &pool, dev);
    if (res != CUDA_SUCCESS) {
        LOG_ERROR("cuDeviceGetMemPool failed res=%d", res);
        goto release_device_memory;
    }

    res = CUDA_OVERRIDE_CALL(cuda_library_entry, cuMemPoolGetAttribute, pool,
                             CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, &poollimit);
    if (res != CUDA_SUCCESS) {
        LOG_ERROR("cuMemPoolGetAttribute failed res=%d", res);
        goto release_device_memory;
    }

    if (poollimit != 0) {
        if (poollimit > device_allocasync->limit) {
            /* Charge only the growth of the pool's high-water mark, never
             * more than this request. */
            allocsize = (poollimit - device_allocasync->limit < size)
                            ? poollimit - device_allocasync->limit
                            : size;
            add_gpu_device_memory_usage(getpid(), dev, allocsize, 2);
            device_allocasync->limit = device_allocasync->limit + allocsize;
            e->entry->length = allocsize;
        } else {
            e->entry->length = 0;
        }
    }

    /* Publish the pointer only once the allocation is fully tracked. */
    *address = e->entry->address;
    LIST_ADD(device_allocasync, e);
    return 0;

release_device_memory:
    /* The allocation succeeded but could not be accounted for. Hand it back
     * so a failing call does not leave untracked device memory behind. */
    CUDA_OVERRIDE_CALL(cuda_library_entry, cuMemFreeAsync, e->entry->address, hStream);
    return res;
}

/*
 * Serialized wrapper around add_chunk_async(): holds the file-level mutex for
 * the duration of the call and returns the inner status code unchanged.
 */
int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream) {
    pthread_mutex_lock(&mutex);
    const int status = add_chunk_async(dptr, size, hStream);
    pthread_mutex_unlock(&mutex);
    return status;
}
Loading
Loading