# Benchmark configuration for the `lingbot_map` model entry.
# NOTE(review): the keys prefixed with `_` are presumably options forwarded to
# the model wrapper, as opposed to the `model` / `env` selectors - confirm
# against the code that loads this file.
model: lingbot_map
env: lingbot-map
# NOTE(review): absolute path under one user's home directory; presumably needs
# to be overridden on any other machine - confirm.
_checkpoint: /home/jcy/workspace/models/lingbot-map-long.pt
_device: cuda
_mode: windowed
_use_amp: true
_use_sdpa: false # Force FlashInfer backend (paged KV cache attention)
# 518 = 37 * 14, i.e. the image size is a whole multiple of `_patch_size`.
_image_size: 518
_patch_size: 14
_enable_3d_rope: true
# Same value as `_kv_cache_scale_frames` below.
# NOTE(review): presumably the two must be kept in sync - confirm.
_num_scale_frames: 2
_max_frame_num: 1024
_kv_cache_sliding_window: 48
_kv_cache_scale_frames: 2
# The 320 quoted in the inline comment below is the same number configured in
# `_auto_keyframe_threshold`; update both together if the threshold changes.
_keyframe_interval: auto # auto = 1 if frames<=320 else ceil(N/320); or a fixed int
_auto_keyframe_threshold: 320 # frame-count threshold driving the 'auto' keyframe_interval (streaming mode)
# NOTE(review): unit not stated in this file; presumably a pixel-area cap used
# when resizing inputs - confirm.
_area_budget: 255000
# Same value as `_patch_size` (14).
# NOTE(review): presumably resized dimensions are snapped to a multiple of
# this - confirm.
_align: 14
{
"lingbot_map": {
"ate": 2.0922960340163272,
"num_scenes": 1,
"rpe_rot": 0.8845998878433984,
"rpe_trans": 0.1718545140124919
}
}
{
"lingbot_map": {
"AUC_03": 11.648380355276906,
"AUC_05": 21.477664576802507,
"AUC_15": 59.774817136886114,
"AUC_30": 78.31432863113898,
"Racc_03": 65.92868338557993,
"Racc_05": 84.53173981191222,
"Racc_15": 100.0,
"Racc_30": 100.0,
"Tacc_03": 22.71551724137931,
"Tacc_05": 43.256269592476485,
"Tacc_15": 92.4960815047022,
"Tacc_30": 98.52664576802508,
"num_scenes": 1
}
}
I resolved the discrepancies between lingbot-map.py and predict_long.py (ureeey@6226ed3), then ran the benchmark with the following results. Are these results within the normal range?
lingbot-map.yaml
Results