Hi, when I run a modified `exp_inverse.py` example to fold a cloth, there seems to be a memory leak: checking with `htop`, the memory used by the `exp_inverse.py` process keeps increasing with every epoch, and the process is eventually killed when the run is long. Here is our code:
```python
import torch
import arcsim
import gc
import time
import json
import sys
import os
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

now = datetime.now()
timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')

steps = 30
epochs = 10
node_number = 0
handles = [25, 60, 30, 54]
losses = []
param_g = torch.zeros([steps, 12], dtype=torch.float64, requires_grad=True)

default_dir = 'results/' + time.ctime()
os.mkdir(default_dir)
out_path = default_dir + '/default_out'
os.mkdir(out_path)

with open('conf/rigidcloth/drag/drag.json', 'r') as f:
    config = json.load(f)

def save_config(config, file):
    with open(file, 'w') as f:
        json.dump(config, f)

save_config(config, out_path + '/conf.json')

torch.set_num_threads(16)
scalev = 1

def reset_sim(sim, epoch):
    if epoch < 20:
        arcsim.init_physics(out_path + '/conf.json', out_path + '/out%d' % epoch, False)
    else:
        arcsim.init_physics(out_path + '/conf.json', out_path + '/out', False)

def get_target_mesh():
    sim = arcsim.get_sim()
    arcsim.init_physics('conf/rigidcloth/fold_targets/target1.json', out_path + '/target', False)
    global node_number
    node_number = len(sim.cloths[0].mesh.nodes)
    ref = [sim.cloths[0].mesh.nodes[i].x.numpy() for i in range(node_number)]
    ref = torch.from_numpy(np.vstack(ref))
    return ref

def get_loss(sim, ref):
    reg = torch.norm(param_g, p=2) * 0.001
    loss = 0
    for i in range(ref.shape[0]):
        loss += torch.norm(ref[i] - sim.cloths[0].mesh.nodes[i].x) ** 2
    loss /= node_number
    loss += reg
    return loss

def run_sim(steps, sim, ref):
    # sim.obstacles[2].curr_state_mesh.dummy_node.x = param_g[1]
    print("step")
    for step in range(steps):
        print(step)
        # Add the learned velocity increments to the handle nodes, then step.
        for i in range(len(handles)):
            inc_v = param_g[step, 3 * i:3 * i + 3]
            sim.cloths[0].mesh.nodes[handles[i]].v += inc_v
            del inc_v
        arcsim.sim_step()
    loss = get_loss(sim, ref)
    return loss

@profile  # injected by the memory profiler used to inspect this script
def do_train(cur_step, optimizer, scheduler, sim):
    epoch = 0
    ref = get_target_mesh()
    print(ref)
    while True:
        reset_sim(sim, epoch)
        st = time.time()
        loss = run_sim(steps, sim, ref)
        en0 = time.time()
        optimizer.zero_grad()
        loss.backward()
        en1 = time.time()
        print("=======================================")
        # 'f' is the log file opened in the main block below
        f.write('epoch {}: loss={}\n'.format(epoch, loss.data))
        print('epoch {}: loss={}\n'.format(epoch, loss.data))
        print('forward time={}'.format(en0 - st))
        print('backward time={}'.format(en1 - en0))
        optimizer.step()
        # scheduler.step(epoch)
        losses.append(loss)
        if epoch >= epochs:
            break
        epoch = epoch + 1
        # break

def visualize_loss(losses, dir_name):
    plt.plot(losses)
    plt.title('losses')
    plt.xlabel('epochs')
    plt.ylabel('losses')
    plt.savefig(dir_name + '/' + 'loss.jpg')

with open(out_path + ('/log%s.txt' % timestamp), 'w', buffering=1) as f:
    tot_step = 1
    sim = arcsim.get_sim()
    # reset_sim(sim)
    lr = 10
    momentum = 0.4
    f.write('lr={} momentum={}\n'.format(lr, momentum))
    optimizer = torch.optim.SGD([{'params': param_g, 'lr': lr}], momentum=momentum)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2, eta_min=0.0001)
    for cur_step in range(tot_step):
        do_train(cur_step, optimizer, scheduler, sim)
    visualize_loss(losses, default_dir)
    print("done")
```

Also, I used a memory profiler to inspect the code and found that `arcsim.sim_step()` takes most of the memory without releasing it.
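To turn the `htop` observation into per-epoch numbers, the resident set size can be logged from inside the training loop. A minimal sketch, assuming the `psutil` package is installed (the helper `log_rss` is hypothetical, not part of arcsim):

```python
import os
import psutil

def log_rss(tag):
    # Resident set size (RSS) of the current process, in MiB.
    rss = psutil.Process(os.getpid()).memory_info().rss / 2**20
    print('{}: RSS = {:.1f} MiB'.format(tag, rss))
```

Calling `log_rss('epoch %d' % epoch)` right after `optimizer.step()` shows exactly how much memory each epoch retains.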
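One Python-side detail worth ruling out (an observation about the script above, not a claim about arcsim internals): `losses.append(loss)` stores the graph-attached loss tensor, and any live reference to such a tensor keeps its entire autograd graph reachable, so per-epoch history accumulates even if the simulator frees everything correctly. A minimal sketch of the pattern and the detached variant:

```python
import torch

losses = []
param = torch.zeros(3, requires_grad=True)

for epoch in range(10):
    loss = (param ** 2).sum()   # builds a fresh autograd graph each epoch
    loss.backward()
    # losses.append(loss)       # would keep every epoch's graph reachable
    losses.append(loss.item())  # stores a plain float; the graph can be freed
```

`visualize_loss` only needs the scalar values, so storing floats there loses nothing.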