From 5178565802c5ef753cffc967a8ca7b0b6f33ad08 Mon Sep 17 00:00:00 2001 From: HenryLiukkk <1296629304@qq.com> Date: Tue, 11 Nov 2025 21:07:58 +0800 Subject: [PATCH] feat: add ALOHA robot implementation and evaluation scripts --- README.md | 590 +----------------- .../__pycache__/aloha_utils.cpython-310.pyc | Bin 0 -> 2740 bytes .../__pycache__/constants.cpython-310.pyc | Bin 0 -> 3271 bytes .../__pycache__/real_env.cpython-310.pyc | Bin 0 -> 7934 bytes .../__pycache__/robot_utils.cpython-310.pyc | Bin 0 -> 7982 bytes experiments/robot/aloha/aloha_utils.py | 85 +++ experiments/robot/aloha/constants.py | 128 ++++ ...EVAL-LOCAL-openvla-2025_11_08-15_14_50.txt | 7 + ...EVAL-LOCAL-openvla-2025_11_08-15_18_25.txt | 1 + ...EVAL-LOCAL-openvla-2025_11_08-15_20_19.txt | 7 + ...EVAL-LOCAL-openvla-2025_11_08-15_23_06.txt | 5 + ...EVAL-LOCAL-openvla-2025_11_08-15_27_04.txt | 1 + .../aloha/preprocess_split_aloha_data.py | 260 ++++++++ experiments/robot/aloha/real_env.py | 242 +++++++ .../robot/aloha/requirements_aloha.txt | 26 + experiments/robot/aloha/robot_utils.py | 187 ++++++ experiments/robot/aloha/run_aloha_eval.py | 449 +++++++++++++ prismatic/vla/constants.py | 4 +- prismatic/vla/datasets/datasets.py | 2 +- prismatic/vla/datasets/rlds/oxe/configs.py | 9 + .../vla/datasets/rlds/oxe/materialize.py | 7 +- 21 files changed, 1439 insertions(+), 571 deletions(-) create mode 100644 experiments/robot/aloha/__pycache__/aloha_utils.cpython-310.pyc create mode 100644 experiments/robot/aloha/__pycache__/constants.cpython-310.pyc create mode 100644 experiments/robot/aloha/__pycache__/real_env.cpython-310.pyc create mode 100644 experiments/robot/aloha/__pycache__/robot_utils.cpython-310.pyc create mode 100644 experiments/robot/aloha/aloha_utils.py create mode 100755 experiments/robot/aloha/constants.py create mode 100644 experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_14_50.txt create mode 100644 
experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_18_25.txt create mode 100644 experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_20_19.txt create mode 100644 experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_23_06.txt create mode 100644 experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_27_04.txt create mode 100644 experiments/robot/aloha/preprocess_split_aloha_data.py create mode 100644 experiments/robot/aloha/real_env.py create mode 100644 experiments/robot/aloha/requirements_aloha.txt create mode 100644 experiments/robot/aloha/robot_utils.py create mode 100644 experiments/robot/aloha/run_aloha_eval.py diff --git a/README.md b/README.md index 458b012..dfa9da7 100644 --- a/README.md +++ b/README.md @@ -1,78 +1,5 @@ -
- - -[![Paper](https://img.shields.io/badge/Paper-A42C25?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2509.09372) [![Hugging Face Collection](https://img.shields.io/badge/Models-fcd022?style=for-the-badge&logo=huggingface&logoColor=white)](https://huggingface.co/VLA-Adapter) [![Twitter](https://img.shields.io/badge/AK-%23000000.svg?style=for-the-badge&logo=x&logoColor=white)](https://x.com/_akhaliq/status/1966610780838621241) [![WeChat](https://img.shields.io/badge/WeChat--Group-07C160?style=for-the-badge&logo=wechat&logoColor=white)](https://github.com/OpenHelix-Team/VLA-Adapter/issues/1) - -
- -### The official implementation of **VLA-Adapter**. -
- -
-

- -

-
- -> **πŸ“ Paper: https://arxiv.org/abs/2509.09372**
-> **🌍 Project page: https://vla-adapter.github.io/**
-> **πŸ€— HuggingFace: https://huggingface.co/VLA-Adapter**
-> **Github: https://github.com/OpenHelix-Team/VLA-Adapter** - -
- -## :loudspeaker: News! -- **[2025/09/22]** We released our codes! An enhanced **Pro** version is also released (this version conforms to the pipeline in the original paper, but is optimized in implementation). Everyone is welcome to use it!πŸŽ‰ -- **[2025/09/13]** Our paper won the πŸ₯‡**first place** in the [daily list](https://huggingface.co/papers/date/2025-09-12), the πŸ₯ˆ**second place** in the [weekly list](https://huggingface.co/papers/week/2025-W37), and πŸ₯‰**third place** in the [Monthly list](https://huggingface.co/papers/month/2025-09) in HF! ⭐ -- **[2025/09/13]** Our paper listed in the [Trending Paper](https://huggingface.co/papers/trending) in HF! ⭐ -- **[2025/09/12]** We released the original version of the VLA-Adapter for four LIBERO models on [HuggingFace](https://huggingface.co/VLA-Adapter). -- **[2025/09/11]** We released our paper on [ArXiv](https://arxiv.org/abs/2509.09372). - -
- -## :black_nib: TODO List - -- [x] Release **checkpoints** for reproduction. -- [x] Release [VLA-Adapter v2 paper](https://arxiv.org/abs/2509.09372). -- [ ] A more **powerful version**, **VLA-Adapter++**, and a detailed **technical report** πŸ“ will be released soon.
-- [ ] Continue to update the code to adapt to various **real-world systems** deployments, including the configuration of our paper, Franka, UR-5, and AGILE Piper.
-- [ ] It will soon be compatible with **various foundation models**, including but not limited to [VPP](https://arxiv.org/abs/2412.14803), [Ο€0.5](https://arxiv.org/abs/2504.16054).
-- [ ] We will update the **diffusion transformers** and **flow matching** policy networks in the future, and the results will be updated in the subsequent VLA-Adapter++ technical report. -- [ ] We will also update and give more experiments on **Frozen backbone**. -- [ ] We will expand its **generalization** further in the future. Work is in progress! So please stay tuned! -- [ ] **RL post-training** is also in progress. Interested researchers are welcome to join us in building this foundation! -- [ ] **The dual-system compatibility** of VLA-Adapter is under exploration! - - -
- -## 🌟 Table of Contents - -- [:rocket: Quick Start](#rocket-quick-start) - - [Conda Environment of VLA-Adapter](#conda-environment-of-vla-adapter) - - [Install Dependencies](#install-dependencies) -- [:pencil: Data Preparation](#pencil-data-preparation) - - [LIBERO Benchmark](#libero-benchmark) - - [CALVIN Benchmark](#calvin-benchmark) - - [:video_game: Our Dependencies](#video_game-our-dependencies) - - [:pushpin: Benchmark Location](#pushpin-benchmark-location) -- [βš“ VLM backbone](#vlm) -- [:fire: Training for Different Configurations](#fire-training-for-different-configurations)   => Provides **training configurations** for GPUs ranging from **10GB** to **80GB** of VRAM. - - [:books: Related File for Training](#books-related-file-for-training) - - [:ledger: How to Train on Extremely Limited VRAM GPUs](#ledger-how-to-train-on-extremely-limited-vram-gpus)   => A card with 10GB-12GB *(e.g. NVIDIA GeForce RTX 2080Ti, 3060, 3080, 4070, 4080, and 5070)* - - [:ledger: How to Train on Low VRAM GPUs](#ledger-how-to-train-on-low-vram-gpus)   => A card with 24GB *(e.g. NVIDIA GeForce RTX 3090 and 4090)* - - [:ledger: How to Train on Larger VRAM GPUs](#ledger-how-to-train-on-larger-vram-gpus)   => A Consumer GPU with 32GB *(e.g. NVIDIA GeForce RTX 5090)*   A Professional-Grade GPU with 40GB-48GB *(e.g. NVIDIA A100-40GB, A800-40GB, L20, and RTX A6000).* - - [:ledger: How to Train on Sufficient VRAM GPUs](#ledger-how-to-train-on-sufficient-vram-gpus)   => Professional-Grade GPUs with β‰₯80GB *(e.g. NVIDIA A100-80GB, A800-80GB, H100, H800, H20-NVLink, and GB200).* -- [:mechanical_arm: Inference](#mechanical_arm-inference) - - [:books: Related File for Inference](#books-related-file-for-inference) - - [πŸ€— Checkpoint of VLA-Adapter](#ckpts) - - [:notebook: How to Eval](#evals) -- [🌈 Success Rate Comparison](#results) -- [πŸ“ Citation](#cite) -- [:heart: Acknowledgment](#heart-acknowledgment) - -
+### The real-world ALOHA system implementation of **VLA-Adapter**. ## :rocket: Quick Start @@ -110,73 +37,15 @@ pip install "flash-attn==2.5.5" --no-build-isolation ```
-
## :pencil: Data Preparation -### LIBERO Benchmark - -- **(Optional)** - -Clone and install the [LIBERO repo](https://github.com/Lifelong-Robot-Learning/LIBERO) and required packages: - -```bash -git clone https://github.com/Lifelong-Robot-Learning/LIBERO.git -pip install -e LIBERO -pip install -r experiments/robot/libero/libero_requirements.txt # From vla-adapter base dir -``` - -To download the [LIBERO datasets](https://huggingface.co/datasets/openvla/modified_libero_rlds) that we used in our fine-tuning experiments, run the command below. This will download the `Spatial`, `Object`, `Goal`, and `Long` datasets in `RLDS` format, i.e., `libero_spatial_no_noops`, `libero_object_no_noops`, `libero_goal_no_noops`, `libero_10_no_noops`. (`"_no_noops"` stands for no no-op actions, i.e., training samples with near-zero actions are filtered out). These datasets require `~10GB` of memory in total. If needed, see details on how to download the original non-RLDS datasets [here](https://github.com/openvla/openvla?tab=readme-ov-file#libero-setup). You can use these to fine-tune Prismatic-VLMs (built on Qwen2.5-0.5B) or other VLMs. - -```bash -git clone git@hf.co:datasets/openvla/modified_libero_rlds -``` - -🌟 Attention! The dataset downloaded in this way needs to remove of the ``modified_`` word to adapt to the path of - [:pushpin: Benchmark Location](#pushpin-benchmark-location)!!! - -When using LIBERO, you may get an error message like `AttributeError: 'NoneType' object has no attribute 'eglQueryString'`. You can use: - -```bash -sudo apt-get update -sudo apt-get install libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglew-dev -``` - -### CALVIN Benchmark - -- **(Optional)** - -```bash -git clone --recurse-submodules https://github.com/mees/calvin.git -export CALVIN_ROOT=$(pwd)/calvin -cd $CALVIN_ROOT - -# Installation of `pyhash` may fail on some machines. 
If it fails, you can solve it by lowering the `setuptools` version: `pip install setuptools==57.5.0` -sh install.sh -``` - -To download the [CALVIN ABCβ†’D datasets](https://github.com/mees/calvin/tree/main/dataset) that we used in our fine-tuning experiments, run the command below. - -```bash -cd $CALVIN_ROOT/dataset -sh download_data.sh ABC -``` - -If you want to download the RLDS format, you can visit [here](https://huggingface.co/datasets/zhouhongyi/calvin_abc_rlds) to download it. This dataset requires `~50GB` of memory. - -When using CALVIN, you may get an error message like `AttributeError: 'NoneType' object has no attribute 'eglQueryString'`. You can use: - -```bash -sudo apt-get update -sudo apt-get install libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev libglew-dev -``` - - -### :video_game: Our Dependencies +### ALOHA Benchmark +If needed, see details on how to record the real HDF5 datasets [here](https://github.com/tonyzhaozh/aloha). -- **(including LIBERO and CALVIN)** +After data collection is complete, you need to convert the HDF5 format data to RLDS format. Specific methods can be found [here](https://github.com/HenryLiukkk/rlds_dataset_builder). -At this point, the environment is fully installed. If you want to confirm whether the environment is correct, you can see the `our_envs.txt` file we released. ### :pushpin: Benchmark Location @@ -186,23 +55,13 @@ The downloaded dataset can be placed in the `/data` folder. 
The overall director ``` Β· β”œβ”€β”€ data -Β· β”œβ”€β”€ libero - β”‚ β”œβ”€β”€ libero_10_no_noops - β”‚ β”‚ └── 1.0.0 (It contains some json files and 32 tfrecord files) - β”‚ β”œβ”€β”€ libero_goal_no_noops - β”‚ β”‚ └── 1.0.0 (It contains some json files and 16 tfrecord files) - β”‚ β”œβ”€β”€ libero_object_no_noops - β”‚ β”‚ └── 1.0.0 (It contains some json files and 32 tfrecord files) - β”‚ β”œβ”€β”€ libero_spatial_no_noops - β”‚ β”‚ └── 1.0.0 (It contains some json files and 16 tfrecord files) - β”‚ - β”œβ”€β”€ calvin_abc - β”‚ └── 1.0.0 (It contains some json files, 512 train tfrecord files, and 32 valid tfrecord files) +Β· β”œβ”€β”€ aloha + β”‚ └── aloha_put_x_into_the_box_80_demos + β”‚ └── 1.0.0 (It contains some json files and 32 tfrecord files) β”‚ └── other benchmarks ... ``` -

## βš“ VLM backbone @@ -226,63 +85,6 @@ We use the `Prismatic-VLMs` architecture. Since the file is large, please downlo ### :books: Related File for Training * `vla-scripts/finetune.py`: VLA fine-tuning script - -### :ledger: How to Train on Extremely Limited VRAM GPUs - -***=> Extremely Limited VRAM (A card with 10GB-12GB) (e.g. NVIDIA GeForce RTX 2080Ti, 3060, 3080, 4070, 4080, and 5070).*** - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -If your resources are extremely limited, you can set `--batch_size 1` and `--lora_rank 64`, it only requires `9.6GB` of VRAM. Certainly, `batch size = 1` will cause gradient updates to be greatly affected by extreme values, and loss convergence will be unstable. In this case, you can modify the `grad_accumulation_steps` parameter to simulate a similar effect. For example, `--batch_size 1` with `--grad_accumulation_steps 8` has a similar effect to `--batch_size 8`, but the training speed will be slower. This means that you can't use the [OpenVLA-OFT](https://github.com/moojink/openvla-oft) model on a card with `10GB` because even with `batch size = 1`, it requires `25GB` of VRAM. Fortunately, you can use VLA-Adapter. However, the `batch size` is still small, you can increase `--max_steps` to achieve the performance reported in the paper. - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. 
If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. The `original version` is nearly `1GB` smaller than the `pro version`, requiring only `8.6GB` of VRAM. You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - - ```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 400000 \ ---max_steps 400005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 1 \ ---grad_accumulation_steps 8 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ -> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. 
We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - -
- ### :ledger: How to Train on Low VRAM GPUs ***=> Low VRAM (A card with 24GB) (e.g. NVIDIA GeForce RTX 3090 and 4090).*** @@ -305,12 +107,12 @@ In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. ```bash -data_name=libero_spatial_no_noops +data_name=aloha_put_x_into_the_box_80_demos CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ --vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ --config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ +--data_root_dir data/aloha \ --dataset_name $data_name \ --run_root_dir outputs \ --use_film False \ @@ -320,395 +122,51 @@ CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-s --use_fz False \ --use_minivlm True \ --image_aug True \ ---num_steps_before_decay 200000 \ ---max_steps 200005 \ +--num_steps_before_decay 50000 \ +--max_steps 100005 \ --save_freq 5000 \ --save_latest_checkpoint_only False \ --merge_lora_during_training True \ --batch_size 4 \ --grad_accumulation_steps 4 \ ---learning_rate 2e-4 \ +--learning_rate 1e-4 \ --lora_rank 64 \ --use_pro_version True \ --wandb_entity "YOUR_WANDB_ENTITY" \ --wandb_project "$data_name" \ ---run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ -> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - - +--run_id_note VLA-Adapter--aloha_put_x_into_the_box_80_demos--$current_time \ +> logs/VLA-Adapter--aloha_put_x_into_the_box_80_demos--$current_time.log 2>&1 & -
- -### :ledger: How to Train on Larger VRAM GPUs - -***=> A Consumer GPU with 32GB (e.g. NVIDIA GeForce RTX 5090)
=> A Professional-Grade GPU with 40GB-48GB (e.g. NVIDIA A100-40GB, A800-40GB, L20, and RTX A6000).*** - - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -If you have such a device, you can increase the `batch size` and `lora rank`: `--batch_size 8` and `--lora_rank 64`. This only takes nearly `29GB`. - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - -With this configuration, you can achieve the same results as in our paper on the `LIBERO-Object` benchmark, achieving a `99.2%` success rate, in just `8 hours`. The `LIBERO-Spatial` benchmark requires approximately 10 hours of training. However, the `LIBERO-Long` benchmark takes longer because its tasks are longer and more difficult, requiring more training steps to achieve superior performance. - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. 
The `original version` is nearly `1GB` smaller than the `pro version` (1 batch). You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - - ```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 200000 \ ---max_steps 200005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 8 \ ---grad_accumulation_steps 2 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--libero_spatial_no_noops--$current_time \ -> logs/VLA-Adapter--libero_spatial_no_noops--$current_time.log 2>&1 & ``` - Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - +For other GPU configuration solutions, please refer to [VLA-Adapter](https://github.com/OpenHelix-Team/VLA-Adapter).
-### :ledger: How to Train on Sufficient VRAM GPUs - -***=> Professional-Grade GPUs with β‰₯80GB (e.g. NVIDIA A100-80GB, A800-80GB, H100, H800, H20-NVLink, and GB200).*** - ->***About `batch_size`, `lora_rank`, `grad_accumulation_steps`, and `max_steps`.*** - -You can use 1 to 8 GPUs for training by changing the number of `CUDA_VISIBLE_DEVICES` to the GPU number and the number of GPUs after `--nproc-per-node`. In our paper, we use 4Γ—H100 GPU for training. In this configuration, the four suites of the LIBERO benchmark, `Spatial` (only five hours), `Object` (less than one hour), `Goal` (three hours), and `Long` (half a day); the `CALVIN` benchmark (eight hours) - ->***About `vlm_path`.*** - -The VLM in the VLA-Adapter uses the Prismatic-VLMs architecture, with the LLM backbone being `Qwen2.5-0.5B`. You can download it from https://huggingface.co/Stanford-ILIAD/prism-qwen25-extra-dinosiglip-224px-0_5b and place it in `/pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b`. - ->***About `data_name`.*** - -Launch the fine-tuning script with the vla-adapter configuration below. It can run in the background, and the running progress can be seen in the `/logs` folder. You can replace `libero_spatial_no_noops` with `libero_object_no_noops`, `libero_goal_no_noops`, or `libero_10_no_noops`. If you are using the `CALVIN` benchmark, you need to delete `\libero` in `--data_root_dir` and replace `libero_spatial_no_noops` with `calvin_abc`. - - ->***About `use_pro_version`.*** - -In addition, we recently released an enhanced version `Pro` of the VLA-Adapter. While its framework remains consistent with the original paper, it has been enhanced in the implementation, resulting in significantly improved performance. **Therefore, we strongly recommend using the Pro version!** The `Pro` version's `Policy` size is `207MB`, and training speed is virtually unchanged. The `original version` is nearly `1GB` smaller than the `pro version` (1 batch). 
You can choose whether to use the `Pro` version by setting the `use_pro_version` parameter, i.e., the `Pro` version is `--use_pro_version True`. - -```bash -data_name=libero_spatial_no_noops - -CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \ ---vlm_path pretrained_models/prism-qwen25-extra-dinosiglip-224px-0_5b \ ---config_file_path pretrained_models/configs \ ---data_root_dir data/libero \ ---dataset_name $data_name \ ---run_root_dir outputs \ ---use_film False \ ---num_images_in_input 2 \ ---use_proprio True \ ---use_lora True \ ---use_fz False \ ---use_minivlm True \ ---image_aug True \ ---num_steps_before_decay 150000 \ ---max_steps 150005 \ ---save_freq 5000 \ ---save_latest_checkpoint_only False \ ---merge_lora_during_training True \ ---batch_size 16 \ ---grad_accumulation_steps 1 \ ---learning_rate 2e-4 \ ---lora_rank 64 \ ---use_pro_version True \ ---wandb_entity "YOUR_WANDB_ENTITY" \ ---wandb_project "$data_name" \ ---run_id_note VLA-Adapter--spatial--$current_time \ -> logs/VLA-Adapter--spatial--$current_time.log 2>&1 & -``` - -Please note that the obtained models will be stored in the `/outputs` folder. Each model will take up nearly `3GB` of memory, so you need to reserve enough space. We strongly recommend that you get our trained model from [VLA-Adapter HuggingFace](https://huggingface.co/VLA-Adapter) and place it in this folder for inference. - ## :mechanical_arm: Inference -### :books: Related File for Inference -* `experiments/robot/libero/`: LIBERO eval files - * `run_libero_eval.py`: LIBERO eval script - * `libero_utils.py`: LIBERO eval utils -* `experiments/robot/`: General eval utils files - * `openvla_utils.py`: VLA-specific eval utils - * `robot_utils.py`: Other eval utils - -
- -### πŸ€— Checkpoint of VLA-Adapter -We fine-tuned `Qwen2.5-0.5B` with our adapter bridge paradigm on four LIBERO task suites independently: `LIBERO-Spatial`, `LIBERO-Object`, `LIBERO-Goal`, and `LIBERO-Long`. -The four VLA-Adapter checkpoints for LIBERO are available on Hugging Face: -* [VLA-Adapter/LIBERO-Spatial](https://huggingface.co/VLA-Adapter/LIBERO-Spatial) -* [VLA-Adapter/LIBERO-Object](https://huggingface.co/VLA-Adapter/LIBERO-Object) -* [VLA-Adapter/LIBERO-Goal](https://huggingface.co/VLA-Adapter/LIBERO-Goal) -* [VLA-Adapter/LIBERO-Long](https://huggingface.co/VLA-Adapter/LIBERO-Long) - -In addition, we also provide a `Pro` version, we used `4*H100` GPUs for training, `--batch_size 16`, `--lora rank 64`, and the `--max_steps 100000`. The Pro checkpoints is: - -* [VLA-Adapter/LIBERO-Spatial-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Spatial-Pro) `(97.8 -> 99.6)` -* [VLA-Adapter/LIBERO-Object-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Object-Pro) `(99.2 -> 99.6)` -* [VLA-Adapter/LIBERO-Goal-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Goal-Pro) `(97.2 -> 98.2)` -* [VLA-Adapter/LIBERO-Long-Pro](https://huggingface.co/VLA-Adapter/LIBERO-Long-Pro) `(95.0 -> 96.4)` -* [VLA-Adapter/CALVIN-ABC-Pro](https://huggingface.co/VLA-Adapter/CALVIN-ABC-Pro) `(4.42 -> 4.50)` - -These files need to be placed in the `/output` folder. If you trained your own models, it will also be stored here. The subsequent eval code will call the model in this folder for inference. - - -
- ### :notebook: How to Eval -**We strongly recommend that you use our open source `Pro` version of the model, which has stronger performance.** To start evaluations with one of these checkpoints, run one of the commands below. Each will automatically download the appropriate checkpoint listed above. If you want to use the original version of the model, you only need to adjust the `-- use_pro_version` parameter to `False` and pass the original version of the model to the `--pretrained_checkpoint` parameter. Finally, the inference results will be displayed in the `/eval_logs` folder, and the inference video will be displayed in the `/rollouts/vla-adapter` folder. - - ```bash -# Launch LIBERO-Spatial-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ - --num_images_in_input 2 \ +# First launch your own robotic arm, then run the following code +python experiments/robot/aloha/run_aloha_eval.py \ + --pretrained_checkpoint \ + --use_l1_regression True \ --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Spatial-Pro \ - --task_suite_name libero_spatial \ - --use_pro_version True \ - > eval_logs/Spatial--chkpt.log 2>&1 & - - -# Launch LIBERO-Object-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ --num_images_in_input 2 \ - --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Object-Pro \ - --task_suite_name libero_object \ - --use_pro_version True \ - > eval_logs/Object--chkpt.log 2>&1 & - - -# Launch LIBERO-Goal-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ --use_proprio True \ - --num_images_in_input 2 \ - --use_film False \ - --pretrained_checkpoint outputs/LIBERO-Goal-Pro \ - --task_suite_name libero_goal \ - --use_pro_version True \ - > eval_logs/Goal--chkpt.log 2>&1 & - - -# Launch LIBERO-Long-Pro (LIBERO-10) evals (Background 
running) -CUDA_VISIBLE_DEVICES=0 python experiments/robot/libero/run_libero_eval.py \ - --use_proprio True \ - --num_images_in_input 2 \ - --use_film False \ - --pretrained_checkpoint outputs/LIBERO-long-Pro \ - --task_suite_name libero_10 \ - --use_pro_version True \ - > eval_logs/Long--chkpt.log 2>&1 & - - -# Launch CALVIN ABC→D-Pro evals (Background running) -CUDA_VISIBLE_DEVICES=0 python vla-scripts/evaluate_calvin.py \ - --pretrained_checkpoint outputs/CALVIN-ABC-Pro \ - > eval_logs/CALVIN--ABC.log 2>&1 & + --center_crop True \ + --unnorm_key aloha_put_x_into_the_box_80_demos \ + --num_rollouts_planned 1 \ + --max_steps 500 ``` - -If you want to get the inference **throughput**, you can run it in the `run_libero_eval.py` file. You can add `start = time.time()` and `end = time.time()` before and after `lines 334--345` and calculate the difference between the two. This difference is the time it takes to generate `8 chunks`. This gives you the inference throughput. We measured it multiple times and took the average value of `0.036s`. -
-## 🌈 Success Rate Comparison - -All our results are inferred on `H100`. You can find the inference `log` file in the model released on [HF](https://huggingface.co/VLA-Adapter) for viewing. The evaluation script will run 500 trials by default (10 tasks x 50 episodes each) in LIBERO and 1,000 task sequences in CALVIN. Use the same card for training and inference whenever possible. **Note that results may vary slightly if you use a different GPU than the H100.** This phenomenon is also mentioned in the OpenVLA-OFT readme file. - -### Performance on LIBERO benchmark. - -XX represents the best performance, XX represents the second best performance, and XX* represents the third best performance. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
LIBERO MethodsScale SpatialObject GoalLong Avg.
Large-scaleFlowVLA (Zhong et al., 2025)8.5B93.295.091.672.688.1
UnifiedVLA (Wang et al., 2025)8.5B95.498.8* 93.6 94.0 95.5
OpenVLA (Kim et al., 2024)7B84.788.479.253.776.5
OpenVLA-OFT (Kim et al., 2025)7B97.6*98.497.994.5*97.1*
UniVLA (Bu et al., 2025)7B96.5 96.8 95.6 92.0 95.2
CoT-VLA (Zhao et al., 2025)7B87.5 91.6 87.6 69.0 81.1
WorldVLA (Cen et al., 2025)7B87.6 96.2 83.4 60.0 81.8
TraceVLA (Zheng et al., 2025)7B84.6 85.2 75.1 54.1 74.8
MolmoAct (Lee et al., 2025)7B87.0 95.4 87.6 77.2 86.6
ThinkAct (Huang et al., 2025)7B88.3 91.4 87.1 70.9 84.4
Small-scale4D-VLA (Zhang et al., 2025)4B88.9 95.2 90.9 79.1 88.6
SpatialVLA (Qu et al., 2025)4B88.2 89.9 78.6 55.5 78.1
Ο€0 (Black et al., 2024)3B96.898.8*95.8 85.2 94.2
Ο€0-FAST (Pertsch et al., 2025)3B96.4 96.8 88.6 60.2 85.5
NORA (Hung et al., 2025)3B92.2 95.4 89.4 74.6 87.9
SmolVLA (Shukor et al., 2025)2.2B93.0 94.0 91.0 77.0 88.8
GR00T N1 (NVIDIA et al., 2025)2B94.4 97.6 93.0 90.6 93.9
Tiny-scaleSeer (Tian et al., 2025)0.57B- - - 78.7 78.7
VLA-OS (Gao et al., 2025)0.5B87.0 96.5 92.7 66.0 85.6
Diffusion Policy (Chi et al., 2023)-78.3 92.5 68.3 50.5 72.4
VLA-Adapter (Ours)0.5B97.899.297.2* 95.0 97.3
VLA-Adapter-Pro (Ours)0.5B99.699.6 98.296.498.5
- -### Performance on CALVIN ABC→D benchmark. - -XX represents the best performance, XX represents the second best performance, and XX* represents the third best performance. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CALVIN MethodsScale 12 34 5 Avg. len
Large-scaleUniVLA (Bu et al., 2025) 7B 95.5 85.8 75.4 66.9 56.5 3.80
OpenVLA (Kim et al., 2024) 7B 91.3 77.8 62.0 52.1 43.5 3.27
OpenVLA-OFT (Kim et al., 2025) 7B 96.3 89.1 82.4 75.8 66.5 4.10
VLAS (Zhao et al., 2025b) 7B 87.2 64.2 40.9 28.1 19.6 2.40
LCB (Shentu et al., 2024) 7B 73.6 50.2 28.5 16.0 9.9 1.78
RoboDual (Bu et al., 2024a) 7B 94.4 82.7 72.1 62.4 54.4 3.66
OpenHelix (Cui et al., 2025) 7B 97.1* 91.4 82.8 72.6 64.1 4.08
ReconVLA (Song et al., 2025c) 7B 95.6 87.6 76.9 69.3 64.1 3.95
Small-scaleDeeR (Yue et al., 2024) 3B 86.2 70.1 51.8 41.5 30.4 2.82
RoboFlamingo (Li et al., 2024b) 3B 82.4 61.9 46.6 33.1 23.5 2.48
VPP (Hu et al., 2025) 1.5B 95.7 91.2 86.3* 81.0* 75.0* 4.33*
SuSIE (Black et al., 2024)1.3B 87.0 69.0 49.0 38.0 26.0 2.69
Tiny-scaleSeer-Large (Tian et al., 2025)0.57B 96.3 91.6* 86.1 80.3 74.0 4.28
MoDE (Reuss et al., 2025) 0.44B 96.2 88.9 81.1 71.8 63.5 4.01
Seer (Tian et al., 2025) 0.32B 94.4 87.2 79.9 72.2 64.3 3.98
VLA-Adapter (Ours)0.5B99.1 94.6 88.8 82.8 76.5 4.42
VLA-Adapter-Pro (Ours)0.5B98.595.0 90.585.380.04.50
- - -
- - -## πŸ“ Citation - -### 🫢 If you feel that this paper, models, or codes are helpful, please cite our paper, thanks for your support of VLA-Adapter! - -```bibtex -@article{wang2025vlaadapter, - author={Wang, Yihao and Ding, Pengxiang and Li, Lingxiao and Cui, Can and Ge, Zirui and Tong, Xinyang and Song, Wenxuan and Zhao, Han and Zhao, Wei and Hou, Pengxu and Huang, Siteng and Tang, Yifan and Wang, Wenhui and Zhang, Ru and Liu, Jianyi and Wang, Donglin}, - title={VLA-Adapter: An Effective Paradigm for Tiny-Scale Vision-Language-Action Model}, - journal={arXiv preprint arXiv:2509.09372}, - year={2025} -} -``` ## :heart: Acknowledgment -We thank [OpenVLA-OFT](https://github.com/moojink/openvla-oft), [MiniVLA](https://github.com/Stanford-ILIAD/openvla-mini), and [RoboDual](https://github.com/OpenDriveLab/RoboDual) for their open-sourced work! - -## 🌟 Star History +We thank [OpenVLA-OFT](https://github.com/moojink/openvla-oft), [MiniVLA](https://github.com/Stanford-ILIAD/openvla-mini), [RoboDual](https://github.com/OpenDriveLab/RoboDual) and [VLA-Adapter](https://github.com/OpenHelix-Team/VLA-Adapter) for their open-sourced work! - - - diff --git a/experiments/robot/aloha/__pycache__/aloha_utils.cpython-310.pyc b/experiments/robot/aloha/__pycache__/aloha_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6a6de2b194a7292284da29a16da0ed63a0e4cb57 GIT binary patch literal 2740 zcmai0&2QYs73XltT`u=SvH~Zu(YVuI;sw0BKp!%)+8Wq=hF&Z%2rIAz6L!J^W@bCE&D;na%#FlNn?aKijgz!eFYwIT zP1>m+_-QBTVBZw2Z>*pzJkiG76TawR?u)MIVO|q`v4(kFY>0Jn?Hd+c6ZWn(cxUnH z*JYd({6J|gkE3KBm2o!Zxk}2-vZ|8cP|{ED4gSqG<$%Ts+Tv+2Bql`897>p5;PaD$fa;=>)S(!Av5 zO!9dlH7^xZ%je6JOrBs76-T4mT3p}CN+JLy9q}wm<;Q&S`7a#v+pt<9qhrbUlPEhP zu3F|YTC&ZxJdV}8Fnk8!(cA;Wk2y@@n#XH5*Kt-3?7*j|!X(<4N$rz>p&@u-X`sh? 
zwLPAxRE|qol;dRfbewB7)loYB>%-0ArigL~JeDW9)Uk2OSgU$i>;}T}q|`7g!7A2bh!`kTwsHii z#nnE-=@@BE0eghQOOYtQm7}_8#4T!X5~blRp3Yv_b?3@8BO@6CXZkuMdQAw=RWg^CE-gO6-KAy@?fq@>qZgUMmO>H zZ!83&jgs+cvt);M)fBd9JhPFQ8)uDk>&y|3aL;Yb&DGqZnabEd$=|K2`K=}V(#6=R z7{+d8W9(HeaBZJ@l`FhC)1OqH=vSb6ZvWB3sRx=hyxK0ChpnnjMemt?>Q~kS>u;^o zPUW9@Rp%T9;_us7x6ctJr7t$B_C?`v4BxnN&N|C;(R|*Z8&%sB&fo4L?-QcVOVp9v4VrqeW+gRGcJgUKV2{xZ0B^@M@@zUNK&?ux5VEMvq>81UtQAj^P-Atjn z1)bLnAX;#9l?SC*)~YKvgBIN?qiNv503oRtbdO^p)k@f3jMI$*wh3Q9VaT^2e{lhqjh6v`{6RJ zBH>?@|NHi*O}M!uAz55ukZg1;0^2r literal 0 HcmV?d00001 diff --git a/experiments/robot/aloha/__pycache__/constants.cpython-310.pyc b/experiments/robot/aloha/__pycache__/constants.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e853a05a36002cca75d19209eb0699f16e5a46ce GIT binary patch literal 3271 zcmb7G&2Jl35Z_&Y#BH3^ZPVD!cbYV7 zJ{q_b4&^{4gb;@cA#OQ=VRjI0qFR2ApLaM~fTgS0oZw$NInfEa} zAHSKMw`O18Q30;EU+|PZ9T9~4IBCB8FnJB0T0|5CDp1i7h{Hu9Qi({KL_J}FdZ~~4 zb&2*|_rOY^+X_8$-9tR|DDl!>;?o5h{6-{x;C;Y*Xg>+iW8?@8k)!lD>7^$~ke(!c zbb$2JQ{)&Oq~VVQ5~2~#kJBjUC+HC8C+RTf19XJ*Q*@N`K|02Hn2vKEp%a`(=_Kbv z^fc$gbc*v4I?eegJ;V7Jo#A|33u=83;n`t!0jR5KmO@L6^lH^o&7_)QXcZC!VGR^kfYr96+=`q`FJgoZB;8v&C zNEml6suj3f^y!xezFWMv{dwZiEW{r36W@jkzLG$#Xs#PnW5lN!1#^Q0-!lOK1`(Nt zL3(*aidxYPjRc(uGcDE9Vj%`F2oGsUQvR1Kbab^}w z%PGeAnpvq?Dy+U(-LeC(7;3pd)e8oCE&&M%>;y2^@EG|R(3G;S0JU0g1Es}!$GtTb(ovw^YFdzHw-I zet;4k&F*8sW)942(<#o+b3Dr7 z*P?9Fbwj&=KSwnJBp?Pv@2h^FB>F|~Gk;GTf9wr=K>}nvUXkB*9`NESzonK@eHN1b E1z1wCX#fBK literal 0 HcmV?d00001 diff --git a/experiments/robot/aloha/__pycache__/real_env.cpython-310.pyc b/experiments/robot/aloha/__pycache__/real_env.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5d9dd0a30981643af847ba936b27492ee956e6db GIT binary patch literal 7934 zcmcgxOK;rP73OPZIHS=>mgRTs(3X>U8pm?tBu!nXjcQw|QhVfBlNW=4q2|&f%6yD3 zIkv2f(1lZ=iyGZ@SD=uW+O~h7D9}H_Re^5mU4bkL6amuj+)GNDv7Ew4S`K)5ABTtM 
zea^iTkB`?BTz~v$!{2&QQT|SY-ItERJGhgt(J+Onfnv+2YHMPw+q!rfVaYD5a;;)l zFsB7(ShcHSP7i9~m^~KO?Rq$FkB1GrA?6LtPuLUTq&+FdC5-pk`@$)EN{q|F{&3o! z4iDG|#JGa-LHl5M$UYPvwhxC#>?2~{4337!>|>%|4W0{++s8#;4Nll6yc4U>+t#}Z ztFf^!6*l%jvrn=*8~;+VPq71RlI{CawP)BA+mD&k-V1E{w#EnE=?4mMs7vB*9()W5 zD6OWN)!y&BsmCKfj+#cMU1?{HrS{y-wljD0x^rW3>1uZJ`rK0c>P_d}n+rE?;Q8L- zLaXhx7H?jkyS89ob>>^y%#9DQ)E)u(ptT#Qsb!}}z<_?Cy|CCCMm7rd;ni!4mlxWf z3_$_gkr8Z?uEiTyTiMY)mo8reJ9yE&>2+exJf4j&gl^X>hTwlP@zQ?J3Ec!TahBt> znCbO-hjaY}2u>0CJ&|hQLQ*4zfFiaIjd5RgV1jQPx%qmc9iJ7d5XIYv7OjMeH z$r@||^HpI8ie1YMVT?_)1DLO~gX|EV<7|cUpV&{)=|-0!ZWBu?i}FHWp8Z#K`$mAsv?+~p(=`l)BFk_0V4`GiJb z90JV;5@W6vrvL})S0(BhD~fsO2L8IotQ>3`t99o5MXM9UiFej|^`aH`yr}tKzVuYu zzhBZnv#e*QT#&xFoOt}MyIb~OoFZwTx@cXFqmG+?YJIC3o}H0Vz3i=%TqV!W)+qX? zjlYen@0%%UzOOc+CtTkjPp9tO^@6=Mg@s{4D3q!4MN>nEAqP1pD=pX?+eHJKk-OHCUg82?R_^EPx+K<$mPB`)p0{-1#u69 zGq1mOe(lZI&#ztBiF-5F%315)S*vr_V(I;!_x4MD1irUkdTJcnr9Oc>i0?fGHSF>` z;+CdykNZjb#5ewT$Tz;7@b-Mi*p*1BU%`DDck&*Z6lD+7n9huW%1RGQ1I1AX8lM~} zo7y83MyXDDm6;DUtQcuY^i}lBX$5apA5!0B3bJ9XWtZ=+T{wR}$#fWKR>?DnUC(O- zA)u|2Jtoqj6UNL7vRaqNC@dm3^z3?(n#8Ck!Kj~A{m4(92ybL1fo6V7<%dB(>IZ`$ znpM6J!)$V=9&rRi#7qREMZqK+&~Pzf1{7u*ux z=En)7Ch;%3iMOTj8BG0qes(1ez1h@D(%E3;{%nuOUG9dnA6}b#WsbQ$R6?`f8Y&~7 z%7kQA&^GG^@rpZ3^@8I?chB|iXQtx_zMb>HnoOcGbwf43F?3UHl^?^!N^OCZqwwfBJdHx(4i1Z}&i3i!7rXF#51$RQ28tVFV z@yXqS9r`R;8Qt{RuZVVh6rbHtVPmkjUuj2_4K39$LhpgvRX6lh-!ul=K<{dgP>OHq zt;Z;+l*eSN%@RL>M`lnS;WU$JQ7;?ohy?9LsM4E;pxkaqA0$AByZ4H5QHR)5a=lQh zKsd~8W_D4{(!#s=&{rlJQ?08F{B~qrfIO9Heg^OHCYl}M5hOXGCy|d(i=`oIb%4-)8Fyl#*;_@M`atWdq>`rA;x7XZL0{WTJ)Tv{TENckl9y1G79nA;A>5GO-6PTO4KWK(?v z`{vl|f^hyk9tbq}giJO3B}|NnkiUVsPjCw}uZ?_HzSkiSD6Sz1lD^?v@T?6DW(;F@ zsMS}E)zWHtvqCeEa<7wAnVfj!7Me`&giLrKKR`nRn=sF=_4>HM^AewXj4FD>$fAO`kL?X5HVY%%i`|@4_yzEIAt=e!V5*1J^}~f4;Ppd*L&_il zm2eBQqUeGAQh22qGOwjvlz&**;P|ePx+E4 z-=5Pkr^`8Jz)?!;pD%f-mBu?;CrWh|_O%w$$$~0#3+h zT7!HJj>snm8WIqxne1Pv`4T#*@Hhl`Q8pm3tC!%1oBAX81k*&{5Pe2ShhSNu-OX9@Z>YF?*?+&8z5Oplk7QIjwo z@_#IR0lHpvrmRvfQ#QdS 
zBQccm)E<&;=q-VF%gCxYNa;CL#Aapg-E%p!P39+^xF4n280aiErp-!LW+9aXnYn~_ z+Vo0ke&Oa)JJYW(T*=HfJcU+dh?B_p90<(AMW&{lh#nQmL@tryOAi2(P&IHE64BJM zxXWL!A1W+j1b)aQNL+La>5#M0ks2ipnv`GOE9(9NtG~f5qAv6>6zwG|LrNga+g8FA z31-{q_si_Xz2!JgQM%&rD&ao{6esiN^VCp1&8ckQKc|LRii=8$ZGl7>qzd9J5)o2S2^OjhN}}E> zQ|zULG$rT#@FlhLMQaXmez&d~0Q+JckK!T*x0$3gO4)sYo$v#F zLm%ixDX9waAk)&;Q)=rJdYO*sJW9fN2yaKCmD{;L11zsW43#fq)WN-dQL3)s7DE0D zI!xVwC^uBpPuQEY5_aJ#t2`_ryAW0I(^R9A?uXbr4fItVt2OkEfq~r=gwHRe2IZfP z@}LY~HdZTaj1)vZ4u3W_O%WEWto}f~qI~w!hMCp~??!c?Z;lPBfHDT9&HBI)F+t&< z3@V%B!+1c0vDru`2;$BU)5*AfJe zD@lL37sRQJ;*RvquZku7Ux6bXs&a*2iZF%%kN=G#mYI|%n47v;spmq6oP0JjXlsy_ zlECwN&6+(?SP7ghLVal0NTT|RBa$_Pa|>H@vPxzXw$TR)r@~}%F2`sLf4NF z=|9G>`66GX*;3RGd-vrgnqq6VFLYCqQs6J+1c-WBSw1<#x=4*UP`i&5LgoeOagE-q z;xq&y6Q@Z+MsnLkAyaNMDYnSdhwYs~mLJ3QQ$LXBV&WW3>W->Fx$B}}Xdgii&;Px1 z){;DzCFkH|Lf`hmk@cjToC}k#I77253pehkD{*9(yl~lLnd*rsD%CBFgapKC@4YLA z&d^ZAa`TeRoj;*UGC7CFMD0%1JKcx1NtJ$w%ap85qpUVG<(KBs+Bg#1%h1C&{{ddV B#KHgo literal 0 HcmV?d00001 diff --git a/experiments/robot/aloha/__pycache__/robot_utils.cpython-310.pyc b/experiments/robot/aloha/__pycache__/robot_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..310595bc89a30af0cb8b09acc3e54c411030c617 GIT binary patch literal 7982 zcmb7J+mjnvdGC9-E~C+0Jf88cy}K-IRtY-;n}l#NUKa0qF(DHNJ0vJ?YuZOGpA+PR4m+9`Zm@dEnp3Lmti(RlLMid7C#XDf0WiZb>cKW1v<2 zook=-o$q#j=O|iOC~A0q`kxD}E9W)s7gWeUCJLXz8~+$YXhQF3b)8eAZXh)}X4k4) zI<=V{yX({)&g&hw?uKsLs~6tVgeB~Iny?RzdJ!c@xG1?Om4qh>_q2Lh6h#T?f;cZ0 zMCG1duZTraMa`mE63a-dVnwVXT@q)+8q#HPR;(jk5gXzh(pB+{coyjyp16c+d~g)JP6ZwXg; zNNrINMWl`>i84}``PS-Q#&;p(yC_z`&l$CFwHq|U&xegjicsFT(xBu54>3D>xbmF|4@(b> zp(&Qd%6*-9+uuxCy`71+p-lWs?s8D_;Rd{W4&GodQtCgH#&%w<8!mJ{Wxq;vB#-PqjaL%s=}FY zHydqWZym%t{-C&(jT+omB|m9(!#D}L{rJ(VDX&g9xYG%HO_cEb{I%MuDm9``Cu}6G zs258IwaS)J+`n5dG;aAjydFu0p%&w?7e~_X#?5#WM0!-~wPIg{I|ohWq@7h6tc5c0 zdq`qcX!Tl&--|@3+&D~vB#~+Ng7d39?q>V{hh1ijVSySvqwQBwxptuK7;^;cdOK|cxQ 
zR(Pi$O3d6#;w>5NM9EgriS~jt?;j+sPQ2N_t32OlVfhpeGj0R#&jMShsIH1{N*?m?y7cH8~YUdmA5Syz>O;P zgE&t1WOUHntLccnVce3T@WWms5(uKQf?jfw$TCRt2mkfbf0s6wF4c+>dx$2_fz%gT z-6ky6Pa?l@>s96Kg#jX8-DamC^~1Gn_AE94IEtt>Xxmkl4voUeMQ~8mvThjUqf?Dk=sEmSwl=2tk14y+I5D1 zvZYD9_$&&95ALNa(6EB_5n{zqPp~3h1>(e^ai~9|jZCbJ)@5Tqb>6ccn1|Ry_ua%e z)Uv(Q-JY3>oP(yjj`!q)Ac=N_VIS%vgcX7!-!L{P2joY&bufV*d2&@Qnteu$lkH~tR_WX+6r=@^lsT^nhNcIh~w zixlm;U3cV>-7?Z8THt(>B7)NYh`H2j|}bhBB5g8G6uGfs>0*Fw!rvPYcDYZ zoz@po?;SqV0&Eu647J3ix3KZ{vtx=RUDUk^QkAsL4ZN-X0XE6RA1rLO>1g9)R|O0joY`um z$D25-Z_-$UwfX9pk3w<*(EwE{;7z?K$3%#%uk3vW!=F-6a`@>N1=>yuT^ zseT4JC9EXzEy68p%WU6%Y3y4D4a;WBCiH z+sAw2fp}wPpPOekXIEr;r}8ZS8pCAV$bj-Uh_D>%lzTcie}OKysM`+6;Kg|^=B$`G z$zvdjH?$5m6;c~TUBVEA93VT5@dD!rl=KumWNj&yuo7H_;9O$AX9)**f>IZej#BS| zOUQyx5rxwch5Q&MmLDhbNg}TknUne(s2fuKn;?T%rtADoH~v8>^?ly*8BPqAi#a9v z+%Q-x&WS9oS9%d}fbE)1gf;UPSgF}lkWep9r>Ixw$F8qFY2^8Jegt}E6sP#W#E4)6h`=t}l3HfK zAlVPab+#GWCdf|cblNrn0CfU@0DZ>xBib!$mNFp;W#Nh|ROS4(B;Q6mZfi{J9dlw( zX9@F?1k(@3(I6} z2>uW^eeYpJ%UhPWQ?aMgUOm+YX_5Os#*{*vH|QWl>Az5A1GieuzajFQVeb z4ldQOz4tGD2TMF*t!abv-RM?Gs?9<4Y0nvbl6dHVLwMwbj?hiwri4Y}IL;>!9Le6* z9-2A6u-RNfPvv!5y|^DV!edkRQGvU&11Wu|DXf=pX+Zj;0_?BQJLm!_h5a~P0&mR} z;lGQQ*}kA%0Vwl!u;<+bM8Z9l3D6^HMamqsaGa4ruNkUBh@o)N20XM&Z7aB5o44Q` zdhjr0l=vhjRn75~))ZJ!d_7i~@SZg#GH9n+J;5y>25E_$5^iI1Ny|{_ASCU-Lou#` z5Q10-mZ<9IK&}6id zRd&D+lwr{+XvSJd8{~JqnfH(x+bR2CG_IJ_@sFrpf(%|@d1wCo+Zm^fC!;5n;|-aW z@G=!A2K645?t;`7Qe;e#p3O+|-ZrA*SBNkWprvwnQpNtkj-R4)ZBZQe9iEtA@|MdtbSEhEC%CHvLL;mcfH;qOFnif)udGFBUDB{^e+6Go80hv`&Jcj4%O zzY2T++ZHBFMKe*Fs^Xi2zMtVw_}RbFGAVpl;XhT#M1CKI(+Vsp53SHK#Ht{{?+(PR>U*i+aiqyE~y!ddSyQj87R0reKS9GY(VinUXA=?Ld-Fc-_oe wPDB$qDeq!>*LI0CACsCqobc%H9uJ4|icz`Zebh6HmRImfp5xhcUsksMAHMuGDgXcg literal 0 HcmV?d00001 diff --git a/experiments/robot/aloha/aloha_utils.py b/experiments/robot/aloha/aloha_utils.py new file mode 100644 index 0000000..002fbbf --- /dev/null +++ b/experiments/robot/aloha/aloha_utils.py @@ -0,0 +1,85 @@ +"""Utils for evaluating policies in real-world ALOHA 
environments.""" + +import os + +import imageio +import numpy as np +from PIL import Image + +from experiments.robot.aloha.real_env import make_real_env +from experiments.robot.robot_utils import ( + DATE, + DATE_TIME, +) + + +def get_next_task_label(task_label): + """Prompt the user to input the next task.""" + if task_label == "": + user_input = "" + while user_input == "": + user_input = input("Enter the task name: ") + task_label = user_input + else: + user_input = input("Enter the task name (or leave blank to repeat the previous task): ") + if user_input == "": + pass # Do nothing -> Let task_label be the same + else: + task_label = user_input + print(f"Task: {task_label}") + return task_label + + +def get_aloha_env(): + """Initializes and returns the ALOHA environment.""" + env = make_real_env(init_node=True) + return env + + +def resize_image_for_preprocessing(img): + """ + Takes numpy array corresponding to a single image and resizes to 256x256, exactly as done + in the ALOHA data preprocessing script, which is used before converting the dataset to RLDS. 
+ """ + ALOHA_PREPROCESS_SIZE = 256 + img = np.array( + Image.fromarray(img).resize((ALOHA_PREPROCESS_SIZE, ALOHA_PREPROCESS_SIZE), resample=Image.BICUBIC) + ) # BICUBIC is default; specify explicitly to make it clear + return img + + +def get_aloha_image(obs): + """Extracts third-person image from observations and preprocesses it.""" + # obs: dm_env._environment.TimeStep + img = obs.observation["images"]["cam_high"] + img = resize_image_for_preprocessing(img) + return img + + +def get_aloha_wrist_images(obs): + """Extracts both wrist camera images from observations and preprocesses them.""" + # obs: dm_env._environment.TimeStep + # left_wrist_img = obs.observation["images"]["cam_left_wrist"] + right_wrist_img = obs.observation["images"]["cam_right_wrist"] + # left_wrist_img = resize_image_for_preprocessing(left_wrist_img) + right_wrist_img = resize_image_for_preprocessing(right_wrist_img) + return right_wrist_img + + +def save_rollout_video(rollout_images, idx, success, task_description, log_file=None, notes=None): + """Saves an MP4 replay of an episode.""" + rollout_dir = f"./rollouts/{DATE}" + os.makedirs(rollout_dir, exist_ok=True) + processed_task_description = task_description.lower().replace(" ", "_").replace("\n", "_").replace(".", "_")[:50] + filetag = f"{rollout_dir}/{DATE_TIME}--openvla_oft--episode={idx}--success={success}--task={processed_task_description}" + if notes is not None: + filetag += f"--{notes}" + mp4_path = f"{filetag}.mp4" + video_writer = imageio.get_writer(mp4_path, fps=25) + for img in rollout_images: + video_writer.append_data(img) + video_writer.close() + print(f"Saved rollout MP4 at path {mp4_path}") + if log_file is not None: + log_file.write(f"Saved rollout MP4 at path {mp4_path}\n") + return mp4_path diff --git a/experiments/robot/aloha/constants.py b/experiments/robot/aloha/constants.py new file mode 100755 index 0000000..b0c0ea5 --- /dev/null +++ b/experiments/robot/aloha/constants.py @@ -0,0 +1,128 @@ +### Task parameters 
+import pathlib +import os + +# # DATA_DIR = os.path.expanduser('~/data') +# DATA_DIR ='/home/test/data/aloha_real' +# +# TASK_CONFIGS = { +# +# 'single_test': { +# 'dataset_dir': DATA_DIR + '/single_test', +# 'episode_len': 300, +# 'camera_names': ['cam_arm_wrist'] +# }, +# +# } +# +# ### ALOHA fixed constants +# DT = 0.02 +# FPS = 50 +# JOINT_NAMES = ["waist", "shoulder", "elbow", "forearm_roll", "wrist_angle", "wrist_rotate"] +# START_ARM_POSE = [0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239, 0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239] +# +# XML_DIR = str(pathlib.Path(__file__).parent.resolve()) + '/assets/' # note: absolute path +# +# # Left finger position limits (qpos[7]), right_finger = -1 * left_finger +# MASTER_GRIPPER_POSITION_OPEN = 0.02417 +# MASTER_GRIPPER_POSITION_CLOSE = 0.01244 +# PUPPET_GRIPPER_POSITION_OPEN = 0.05800 +# PUPPET_GRIPPER_POSITION_CLOSE = 0.01844 +# +# # Gripper joint limits (qpos[6]) +# MASTER_GRIPPER_JOINT_OPEN = 0.3083 # noetic +# MASTER_GRIPPER_JOINT_CLOSE = -0.6842 # noetic +# PUPPET_GRIPPER_JOINT_OPEN = 1.4910 +# PUPPET_GRIPPER_JOINT_CLOSE = -0.6213 +# +# ############################ Helper functions ############################ +# +# MASTER_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_POSITION_CLOSE) / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) +# PUPPET_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_POSITION_CLOSE) / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) +# MASTER_GRIPPER_POSITION_UNNORMALIZE_FN = lambda x: x * (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) + MASTER_GRIPPER_POSITION_CLOSE +# PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN = lambda x: x * (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + PUPPET_GRIPPER_POSITION_CLOSE +# MASTER2PUPPET_POSITION_FN = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(MASTER_GRIPPER_POSITION_NORMALIZE_FN(x)) +# +# MASTER_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - 
MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) +# PUPPET_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) +# MASTER_GRIPPER_JOINT_UNNORMALIZE_FN = lambda x: x * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE +# PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN = lambda x: x * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE +# MASTER2PUPPET_JOINT_FN = lambda x: PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(MASTER_GRIPPER_JOINT_NORMALIZE_FN(x)) +# +# MASTER_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) +# PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) +# +# MASTER_POS2JOINT = lambda x: MASTER_GRIPPER_POSITION_NORMALIZE_FN(x) * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE +# MASTER_JOINT2POS = lambda x: MASTER_GRIPPER_POSITION_UNNORMALIZE_FN((x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)) +# PUPPET_POS2JOINT = lambda x: PUPPET_GRIPPER_POSITION_NORMALIZE_FN(x) * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE +# PUPPET_JOINT2POS = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN((x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)) +# +# MASTER_GRIPPER_JOINT_MID = (MASTER_GRIPPER_JOINT_OPEN + MASTER_GRIPPER_JOINT_CLOSE)/2 +# +# ### Task parameters + +DATA_DIR ='/home/test/data/aloha_real' + +TASK_CONFIGS = { + + 'put_capybara_into_the_box': { + 'dataset_dir': DATA_DIR + '/put_capybara_into_the_box', + 'episode_len': 500, + 'camera_names': ['cam_high', 'cam_right_wrist'] + }, + + 'put_NAILONG_into_the_box': { + 'dataset_dir': DATA_DIR + '/put_NAILONG_into_the_box', + 'episode_len': 500, + 'camera_names': 
['cam_high', 'cam_right_wrist'] + }, + + 'put_Banana_into_the_box': { + 'dataset_dir': DATA_DIR + '/put_NAILONG_into_the_box', + 'episode_len': 500, + 'camera_names': ['cam_high', 'cam_right_wrist'] + }, + +} + +### ALOHA fixed constants +DT = 0.02 +FPS = 50 +JOINT_NAMES = ["waist", "shoulder", "elbow", "forearm_roll", "wrist_angle", "wrist_rotate"] +START_ARM_POSE = [0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239, 0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239] + +# Left finger position limits (qpos[7]), right_finger = -1 * left_finger +MASTER_GRIPPER_POSITION_OPEN = 0.02417 +MASTER_GRIPPER_POSITION_CLOSE = 0.01244 +PUPPET_GRIPPER_POSITION_OPEN = 0.05800 +PUPPET_GRIPPER_POSITION_CLOSE = 0.01844 + +# Gripper joint limits (qpos[6]) +MASTER_GRIPPER_JOINT_OPEN = 0.3083 +MASTER_GRIPPER_JOINT_CLOSE = -0.6842 +PUPPET_GRIPPER_JOINT_OPEN = 1.4910 +PUPPET_GRIPPER_JOINT_CLOSE = -0.6213 + +############################ Helper functions ############################ + +MASTER_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_POSITION_CLOSE) / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) +PUPPET_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_POSITION_CLOSE) / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) +MASTER_GRIPPER_POSITION_UNNORMALIZE_FN = lambda x: x * (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) + MASTER_GRIPPER_POSITION_CLOSE +PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN = lambda x: x * (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + PUPPET_GRIPPER_POSITION_CLOSE +MASTER2PUPPET_POSITION_FN = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(MASTER_GRIPPER_POSITION_NORMALIZE_FN(x)) + +MASTER_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) +PUPPET_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) 
+MASTER_GRIPPER_JOINT_UNNORMALIZE_FN = lambda x: x * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE +PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN = lambda x: x * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE +MASTER2PUPPET_JOINT_FN = lambda x: PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(MASTER_GRIPPER_JOINT_NORMALIZE_FN(x)) + +MASTER_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) +PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + +MASTER_POS2JOINT = lambda x: MASTER_GRIPPER_POSITION_NORMALIZE_FN(x) * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE +MASTER_JOINT2POS = lambda x: MASTER_GRIPPER_POSITION_UNNORMALIZE_FN((x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)) +PUPPET_POS2JOINT = lambda x: PUPPET_GRIPPER_POSITION_NORMALIZE_FN(x) * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE +PUPPET_JOINT2POS = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN((x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)) + +MASTER_GRIPPER_JOINT_MID = (MASTER_GRIPPER_JOINT_OPEN + MASTER_GRIPPER_JOINT_CLOSE)/2 diff --git a/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_14_50.txt b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_14_50.txt new file mode 100644 index 0000000..2b22f32 --- /dev/null +++ b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_14_50.txt @@ -0,0 +1,7 @@ +Loading local VLA model... + +Task: put_capybara_into_the_box +Starting episode 1... +Prepare the scene, and then press Enter to begin... 
+ +Caught exception: RealEnv.get_observation() got an unexpected keyword argument 't' diff --git a/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_18_25.txt b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_18_25.txt new file mode 100644 index 0000000..eb151ea --- /dev/null +++ b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_18_25.txt @@ -0,0 +1 @@ +Loading local VLA model... diff --git a/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_20_19.txt b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_20_19.txt new file mode 100644 index 0000000..2b22f32 --- /dev/null +++ b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_20_19.txt @@ -0,0 +1,7 @@ +Loading local VLA model... + +Task: put_capybara_into_the_box +Starting episode 1... +Prepare the scene, and then press Enter to begin... + +Caught exception: RealEnv.get_observation() got an unexpected keyword argument 't' diff --git a/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_23_06.txt b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_23_06.txt new file mode 100644 index 0000000..e770282 --- /dev/null +++ b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_23_06.txt @@ -0,0 +1,5 @@ +Loading local VLA model... + +Task: put_capybara_into_the_box +Starting episode 1... +Prepare the scene, and then press Enter to begin... diff --git a/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_27_04.txt b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_27_04.txt new file mode 100644 index 0000000..eb151ea --- /dev/null +++ b/experiments/robot/aloha/experiments/logs/EVAL-LOCAL-openvla-2025_11_08-15_27_04.txt @@ -0,0 +1 @@ +Loading local VLA model... 
diff --git a/experiments/robot/aloha/preprocess_split_aloha_data.py b/experiments/robot/aloha/preprocess_split_aloha_data.py new file mode 100644 index 0000000..8de07f2 --- /dev/null +++ b/experiments/robot/aloha/preprocess_split_aloha_data.py @@ -0,0 +1,260 @@ +""" +Preprocesses ALOHA dataset(s) and splits them into train/val sets. + +Preprocessing includes downsizing images from 480x640 to 256x256. +Splits happen at the episode level (not step level), which means that +an episode is treated as an atomic unit that entirely goes to either +the train set or val set. + +Original ALOHA data layout: + /PATH/TO/DATASET/dataset_name/ + - episode_0.hdf5 + - episode_1.hdf5 + - ... + - episode_N.hdf5 + +Preprocessed data layout (after running this script): + /PATH/TO/PREPROCESSED_DATASETS/dataset_name/ + - train/ + - episode_0.hdf5 + - episode_1.hdf5 + - ... + - episode_M.hdf5 + - val/ + - episode_0.hdf5 + - episode_1.hdf5 + - ... + - episode_K.hdf5 + + where N > M > K + +Example usage: + # "put X into pot" task + python experiments/robot/aloha/preprocess_split_aloha_data.py \ + --dataset_path /scr/moojink/data/aloha1_raw/put_green_pepper_into_pot/ \ + --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \ + --percent_val 0.05 && \ + python experiments/robot/aloha/preprocess_split_aloha_data.py \ + --dataset_path /scr/moojink/data/aloha1_raw/put_red_pepper_into_pot/ \ + --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \ + --percent_val 0.05 && \ + python experiments/robot/aloha/preprocess_split_aloha_data.py \ + --dataset_path /scr/moojink/data/aloha1_raw/put_yellow_corn_into_pot/ \ + --out_base_dir /scr/moojink/data/aloha1_preprocessed/ \ + --percent_val 0.05 +""" + +import argparse +import glob +import os +import random + +import h5py +import numpy as np +from PIL import Image +from tqdm import tqdm + + +def load_hdf5(demo_path): + """Loads single episode.""" + if not os.path.isfile(demo_path): + print(f"Dataset does not exist at \n{demo_path}\n") + exit() + + 
print(f"Loading {demo_path}...") + with h5py.File(demo_path, "r") as root: + is_sim = root.attrs["sim"] + qpos = root["/observations/qpos"][()] + qvel = root["/observations/qvel"][()] + effort = root["/observations/effort"][()] + action = root["/action"][()] + image_dict = dict() + for cam_name in root["/observations/images/"].keys(): + image_dict[cam_name] = root[f"/observations/images/{cam_name}"][()] + print(f"Loading episode complete: {demo_path}") + + return qpos, qvel, effort, action, image_dict, is_sim + + +def load_and_preprocess_all_episodes(demo_paths, out_dataset_dir): + """ + Loads and preprocesses all episodes. + Resizes all images in one episode before loading the next, to reduce memory usage. + """ + cam_names = ["cam_high", "cam_left_wrist", "cam_right_wrist"] + idx = 0 + for demo in tqdm(demo_paths): + qpos, qvel, effort, action, image_dict, is_sim = load_hdf5(demo) + # Save non-image info + episode_len = image_dict["cam_high"].shape[0] + # Resize all images + print("Resizing images in episode...") + for k in cam_names: + resized_images = [] + for i in range(episode_len): + resized_images.append( + np.array( + Image.fromarray(image_dict[k][i]).resize( + (args.img_resize_size, args.img_resize_size), resample=Image.BICUBIC + ) + ) # BICUBIC is default; specify explicitly to make it clear + ) + image_dict[k] = np.stack(resized_images) + print("Resizing images in episode complete!") + # Save preprocessed episode + data_dict = dict( + qpos=qpos, + qvel=qvel, + effort=effort, + action=action, + image_dict=image_dict, + is_sim=is_sim, + ) + save_new_hdf5(out_dataset_dir, data_dict, idx) + idx += 1 + + +def randomly_split(full_qpos, full_qvel, full_effort, full_action, full_image_dict, percent_val): + """Randomly splits dataset into train and validation sets.""" + # Create a list of episode indices + num_episodes_total = len(full_qpos) + indices = list(range(num_episodes_total)) + # Shuffle the episode indices + random.shuffle(indices) + # Create new lists 
using the shuffled indices + shuffled_qpos = [full_qpos[idx] for idx in indices] + shuffled_qvel = [full_qvel[idx] for idx in indices] + shuffled_effort = [full_effort[idx] for idx in indices] + shuffled_action = [full_action[idx] for idx in indices] + shuffled_image_dict = { + "cam_high": [], + "cam_left_wrist": [], + "cam_right_wrist": [], + } + for k in full_image_dict.keys(): + shuffled_image_dict[k] = [full_image_dict[k][idx] for idx in indices] + # Split into train and val sets + num_episodes_val = int(num_episodes_total * percent_val) + print(f"Total # steps: {num_episodes_total}; using {num_episodes_val} ({percent_val:.2f}%) for val set") + num_episodes_train = num_episodes_total - num_episodes_val + train_dict = dict( + qpos=shuffled_qpos[:num_episodes_train], + qvel=shuffled_qvel[:num_episodes_train], + effort=shuffled_effort[:num_episodes_train], + action=shuffled_action[:num_episodes_train], + image_dict=dict( + cam_high=shuffled_image_dict["cam_high"][:num_episodes_train], + cam_left_wrist=shuffled_image_dict["cam_left_wrist"][:num_episodes_train], + cam_right_wrist=shuffled_image_dict["cam_right_wrist"][:num_episodes_train], + ), + ) + val_dict = dict( + qpos=shuffled_qpos[num_episodes_train:], + qvel=shuffled_qvel[num_episodes_train:], + effort=shuffled_effort[num_episodes_train:], + action=shuffled_action[num_episodes_train:], + image_dict=dict( + cam_high=shuffled_image_dict["cam_high"][num_episodes_train:], + cam_left_wrist=shuffled_image_dict["cam_left_wrist"][num_episodes_train:], + cam_right_wrist=shuffled_image_dict["cam_right_wrist"][num_episodes_train:], + ), + ) + return train_dict, val_dict + + +def save_new_hdf5(out_dataset_dir, data_dict, episode_idx): + """Saves an HDF5 file for a new episode.""" + camera_names = data_dict["image_dict"].keys() + H, W, C = data_dict["image_dict"]["cam_high"][0].shape + out_path = os.path.join(out_dataset_dir, f"episode_{episode_idx}.hdf5") + # Save HDF5 with same structure as original demos (except that 
def save_new_hdf5(out_dataset_dir, data_dict, episode_idx):
    """Save one preprocessed episode as `episode_{episode_idx}.hdf5`.

    The file layout mirrors the raw ALOHA demos, plus a `/relative_action`
    dataset holding per-step joint-position deltas.
    """
    camera_names = data_dict["image_dict"].keys()
    H, W, C = data_dict["image_dict"]["cam_high"][0].shape
    out_path = os.path.join(out_dataset_dir, f"episode_{episode_idx}.hdf5")
    # Magic constant for rdcc_nbytes comes from the ALOHA codebase.
    with h5py.File(out_path, "w", rdcc_nbytes=1024**2 * 2) as root:
        episode_len = data_dict["qpos"].shape[0]
        root.attrs["sim"] = data_dict["is_sim"]
        obs = root.create_group("observations")
        for key in ("qpos", "qvel", "effort"):
            obs.create_dataset(key, (episode_len, 14))
            root[f"/observations/{key}"][...] = data_dict[key]
        image = obs.create_group("images")
        for cam_name in camera_names:
            image.create_dataset(
                cam_name,
                (episode_len, H, W, C),
                dtype="uint8",
                chunks=(1, H, W, C),
            )
            root[f"/observations/images/{cam_name}"][...] = data_dict["image_dict"][cam_name]
        root.create_dataset("action", (episode_len, 14))
        root["/action"][...] = data_dict["action"]
        # Relative actions: per-step change in joint position. The last step
        # has no successor, so copy the previous delta.
        # BUG FIX: guard against single-step episodes, for which the original
        # `relative_actions[-1] = relative_actions[-2]` raised IndexError.
        actions = data_dict["action"]
        relative_actions = np.zeros_like(actions)
        if episode_len > 1:
            relative_actions[:-1] = actions[1:] - actions[:-1]
            relative_actions[-1] = relative_actions[-2]
        root.create_dataset("relative_action", (episode_len, 14))
        root["/relative_action"][...] = relative_actions
    print(f"Saved dataset: {out_path}")


def main(args):
    """Split raw episode files into train/val sets and preprocess each split."""
    # Create directory to save the preprocessed dataset (if needed).
    os.makedirs(args.out_base_dir, exist_ok=True)
    out_dataset_dir = os.path.join(args.out_base_dir, os.path.basename(args.dataset_path.rstrip("/")))
    os.makedirs(out_dataset_dir, exist_ok=True)
    # Gather and shuffle the raw episode filepaths.
    all_demo_paths = sorted(glob.glob(os.path.join(args.dataset_path, "*.hdf5")))
    num_episodes_total = len(all_demo_paths)
    indices = list(range(num_episodes_total))
    random.shuffle(indices)
    # Split into train and val sets.
    num_episodes_val = int(num_episodes_total * args.percent_val)
    # BUG FIX: `percent_val` is a fraction, so format with `:.2%`; the original
    # `{args.percent_val:.2f}%` printed "0.05%" for a 5% split.
    print(f"Total # episodes: {num_episodes_total}; using {num_episodes_val} ({args.percent_val:.2%}) for val set")
    num_episodes_train = num_episodes_total - num_episodes_val
    train_demo_paths = [all_demo_paths[i] for i in indices[:num_episodes_train]]
    val_demo_paths = [all_demo_paths[i] for i in indices[num_episodes_train:]]
    # Preprocess both splits and save the results.
    out_dataset_dir_train = os.path.join(out_dataset_dir, "train")
    out_dataset_dir_val = os.path.join(out_dataset_dir, "val")
    os.makedirs(out_dataset_dir_train, exist_ok=True)
    os.makedirs(out_dataset_dir_val, exist_ok=True)
    load_and_preprocess_all_episodes(train_demo_paths, out_dataset_dir_train)
    load_and_preprocess_all_episodes(val_demo_paths, out_dataset_dir_val)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        required=True,
        help="Path to raw ALOHA dataset directory. Example: /PATH/TO/USER/data/aloha_raw/put_green_pepper_into_pot/",
    )
    parser.add_argument(
        "--out_base_dir",
        required=True,
        help="Path to directory in which to save preprocessed dataset. Example: /PATH/TO/USER/data/aloha_preprocessed/",
    )
    parser.add_argument(
        "--percent_val",
        type=float,
        help="Percent of dataset to use as validation set (measured in episodes, not steps).",
        default=0.05,
    )
    parser.add_argument(
        "--img_resize_size",
        type=int,
        help="Size to resize images to. Final images will be square (img_resize_size x img_resize_size pixels).",
        default=256,
    )
    args = parser.parse_args()

    main(args)
class RealEnv:
    """
    Environment for real-robot single-arm (right) manipulation.

    NOTE: the original docstring described a bi-manual 14-DoF env; this class
    only drives the right arm.

    Action space: [right_arm_qpos (6),          # absolute joint positions
                   right_gripper_position (1)]  # normalized gripper position (0: close, 1: open)

    Observation space: {"qpos":   [right_arm_qpos (6), right_gripper_position (1)],
                        "qvel":   [right_arm_qvel (6), right_gripper_velocity (1)],
                        "effort": [right_arm_effort (7)],
                        "images": {"cam_high": (480x640x3),          # h, w, c, dtype='uint8'
                                   "cam_right_wrist": (480x640x3)}}  # h, w, c, dtype='uint8'
    """

    def __init__(self, init_node, setup_robots=True, setup_base=False):
        """Connect to the right puppet arm plus the joint-state and image recorders.

        Args:
            init_node: Whether to create the ROS node (done once per process).
            setup_robots: Reboot/configure the puppet arm on startup.
            setup_base: Unused here; kept for interface compatibility with the
                mobile-base variant of this environment.
        """
        self.puppet_bot_right = InterbotixManipulatorXS(
            robot_model="vx300s",
            group_name="arm",
            gripper_name="gripper",
            robot_name="puppet_right",
            init_node=init_node,
        )
        if setup_robots:
            self.setup_robots()
        self.recorder_right = Recorder('right', init_node=False)
        self.image_recorder = ImageRecorder(init_node=False)
        self.gripper_command = JointSingleCommand(name="gripper")

    def setup_robots(self):
        """Reboot the gripper motor, set operating modes, and enable torque."""
        setup_puppet_bot(self.puppet_bot_right)

    def get_qpos(self):
        """Return [6 arm joint positions, 1 normalized gripper position]."""
        right_qpos_raw = self.recorder_right.qpos
        right_arm_qpos = right_qpos_raw[:6]
        # Index 7 holds the gripper *position* (not a joint angle); normalize it.
        right_gripper_qpos = [PUPPET_GRIPPER_POSITION_NORMALIZE_FN(right_qpos_raw[7])]
        return np.concatenate([right_arm_qpos, right_gripper_qpos])

    def get_qvel(self):
        """Return [6 arm joint velocities, 1 normalized gripper velocity]."""
        right_qvel_raw = self.recorder_right.qvel
        right_arm_qvel = right_qvel_raw[:6]
        right_gripper_qvel = [PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(right_qvel_raw[7])]
        return np.concatenate([right_arm_qvel, right_gripper_qvel])

    def get_effort(self):
        """Return the raw effort readings of the first 7 joints."""
        right_effort_raw = self.recorder_right.effort
        return np.concatenate([right_effort_raw[:7]])

    def get_images(self):
        """Return the latest cached frame from every subscribed camera."""
        return self.image_recorder.get_images()

    def set_gripper_pose(self, right_gripper_desired_pos_normalized):
        """Command the gripper to a normalized position (0: close, 1: open)."""
        right_gripper_desired_joint = PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(right_gripper_desired_pos_normalized)
        self.gripper_command.cmd = right_gripper_desired_joint
        self.puppet_bot_right.gripper.core.pub_single.publish(self.gripper_command)

    def _reset_joints(self):
        """Move the arm back to the canonical start pose."""
        reset_position = START_ARM_POSE[:6]
        move_arms([self.puppet_bot_right], [reset_position], move_time=1)

    def _reset_gripper(self):
        """Re-home the gripper: first open, then close."""
        move_grippers([self.puppet_bot_right], [PUPPET_GRIPPER_JOINT_OPEN], move_time=0.5)
        move_grippers([self.puppet_bot_right], [PUPPET_GRIPPER_JOINT_CLOSE], move_time=1)

    def _get_obs(self):
        """Assemble the raw observation dict (qpos/qvel/effort/images)."""
        obs = collections.OrderedDict()
        obs['qpos'] = self.get_qpos()
        obs['qvel'] = self.get_qvel()
        obs['effort'] = self.get_effort()
        obs['images'] = self.get_images()
        return obs

    def get_observation(self, t=0):
        """Return the current observation wrapped in a dm_env.TimeStep."""
        step_type = dm_env.StepType.FIRST if t == 0 else dm_env.StepType.MID
        return dm_env.TimeStep(
            step_type=step_type,
            reward=self.get_reward(),
            discount=None,
            observation=self._get_obs(),
        )

    def get_reward(self):
        """No task reward is computed on the real robot."""
        return 0

    def reset(self, fake=False):
        """Reset the arm and gripper; with fake=True, skip the physical reset.

        BUG FIX: the original set observation=self.get_observation(), which is
        already a TimeStep, nesting TimeSteps so ts.observation['images']
        failed downstream (e.g. in test_real_teleop). The observation is now
        the plain obs dict.
        """
        if not fake:
            # Reboot puppet robot gripper motors
            self.puppet_bot_right.dxl.robot_reboot_motors("single", "gripper", True)
            self._reset_joints()
            self._reset_gripper()
        return dm_env.TimeStep(
            step_type=dm_env.StepType.FIRST,
            reward=self.get_reward(),
            discount=None,
            observation=self._get_obs(),
        )

    def step(self, action, base_action=None, get_tracer_vel=False, get_obs=True):
        """Execute one absolute-joint-position action (6 joints + 1 gripper).

        Args:
            action: Sequence of 7 values; last entry is the normalized gripper.
            base_action, get_tracer_vel: Unused; kept for interface
                compatibility with the mobile-base variant.
            get_obs: Whether to return an observation with the TimeStep.

        BUG FIX: the original called self.get_observation(get_tracer_vel),
        passing a bool where the step index `t` was expected, and nested the
        resulting TimeStep inside another TimeStep. The observation is now
        the plain obs dict (or None).
        """
        right_action = action
        self.puppet_bot_right.arm.set_joint_positions(right_action[:6], blocking=False)
        self.set_gripper_pose(right_action[-1])
        obs = self._get_obs() if get_obs else None
        return dm_env.TimeStep(
            step_type=dm_env.StepType.MID,
            reward=self.get_reward(),
            discount=None,
            observation=obs,
        )
def get_action(master_bot_right):
    """Read the current teleop action (6 arm joints + 1 normalized gripper)
    from the master arm's joint states."""
    joint_positions = master_bot_right.dxl.joint_states.position
    action = np.zeros(7)  # 6 joints + 1 gripper
    action[:6] = joint_positions[:6]
    action[6] = MASTER_GRIPPER_JOINT_NORMALIZE_FN(joint_positions[6])
    return action


def make_real_env(init_node, setup_robots=True, setup_base=False):
    """Factory for RealEnv, mirroring the upstream ALOHA API."""
    return RealEnv(init_node, setup_robots, setup_base)


def test_real_teleop():
    """
    Test teleoperation and show image observations onscreen.

    Joint poses are first read from the master arm, then used as actions to
    step the environment, which returns full observations including images.
    Having a single script yields higher-fidelity (obs, action) pairs than
    separate teleoperation and recording scripts would.
    """
    onscreen_render = True
    render_cam = 'cam_right_wrist'

    # Source of teleop data: the master (leader) arm.
    master_bot_right = InterbotixManipulatorXS(
        robot_model="wx250s",
        group_name="arm",
        gripper_name="gripper",
        robot_name="master_right",
        init_node=True,
    )
    setup_master_bot(master_bot_right)

    # Environment setup (the ROS node was already created by the master bot).
    env = make_real_env(init_node=False)
    ts = env.reset(fake=True)
    episode = [ts]

    # Live visualization setup.
    plt_img = None
    if onscreen_render:
        ax = plt.subplot()
        plt_img = ax.imshow(ts.observation['images'][render_cam])
        plt.ion()

    for _ in range(1000):
        ts = env.step(get_action(master_bot_right))
        episode.append(ts)
        if onscreen_render:
            plt_img.set_data(ts.observation['images'][render_cam])
            plt.pause(DT)
        else:
            time.sleep(DT)


if __name__ == '__main__':
    test_real_teleop()
class ImageRecorder:
    """Subscribes to the USB camera topics and caches the latest frame
    (and its timestamp) for each camera."""

    def __init__(self, init_node=True, is_debug=False):
        from collections import deque
        import rospy
        from cv_bridge import CvBridge
        from sensor_msgs.msg import Image
        self.is_debug = is_debug
        self.bridge = CvBridge()
        self.camera_names = ['cam_high', 'cam_right_wrist']
        if init_node:
            rospy.init_node('image_recorder', anonymous=True)
        for cam_name in self.camera_names:
            setattr(self, f'{cam_name}_image', None)
            setattr(self, f'{cam_name}_secs', None)
            setattr(self, f'{cam_name}_nsecs', None)
            # rospy needs a distinct bound method per topic, so dispatch explicitly.
            if cam_name == 'cam_high':
                callback_func = self.image_cb_cam_high
            elif cam_name == 'cam_low':
                callback_func = self.image_cb_cam_low
            elif cam_name == 'cam_left_wrist':
                callback_func = self.image_cb_cam_left_wrist
            elif cam_name == 'cam_right_wrist':
                callback_func = self.image_cb_cam_right_wrist
            else:
                raise NotImplementedError
            rospy.Subscriber(f"/usb_{cam_name}/image_raw", Image, callback_func)
            if self.is_debug:
                setattr(self, f'{cam_name}_timestamps', deque(maxlen=50))
        time.sleep(0.5)  # Give subscribers a moment to receive first frames.

    def image_cb(self, cam_name, data):
        """Store the latest image and its timestamp for `cam_name`."""
        setattr(self, f'{cam_name}_image', self.bridge.imgmsg_to_cv2(data, desired_encoding='passthrough'))
        setattr(self, f'{cam_name}_secs', data.header.stamp.secs)
        setattr(self, f'{cam_name}_nsecs', data.header.stamp.nsecs)
        if self.is_debug:
            # BUG FIX: the fractional part must come from nsecs; the original
            # computed `secs + secs * 1e-9`, corrupting the debug timestamps.
            getattr(self, f'{cam_name}_timestamps').append(
                data.header.stamp.secs + data.header.stamp.nsecs * 1e-9
            )

    def image_cb_cam_high(self, data):
        return self.image_cb('cam_high', data)

    def image_cb_cam_low(self, data):
        return self.image_cb('cam_low', data)

    def image_cb_cam_left_wrist(self, data):
        return self.image_cb('cam_left_wrist', data)

    def image_cb_cam_right_wrist(self, data):
        return self.image_cb('cam_right_wrist', data)

    def get_images(self):
        """Return {camera_name: latest frame (or None if none received yet)}."""
        return {cam_name: getattr(self, f'{cam_name}_image') for cam_name in self.camera_names}

    def print_diagnostics(self):
        """Print the observed frame rate of each camera (debug mode only)."""
        def dt_helper(ts):
            ts = np.array(ts)
            return np.mean(ts[1:] - ts[:-1])
        for cam_name in self.camera_names:
            image_freq = 1 / dt_helper(getattr(self, f'{cam_name}_timestamps'))
            print(f'{cam_name} {image_freq=:.2f}')
        print()


class Recorder:
    """Caches the latest joint states and commands of one puppet arm side."""

    def __init__(self, side, init_node=True, is_debug=False):
        from collections import deque
        import rospy
        from sensor_msgs.msg import JointState
        from interbotix_xs_msgs.msg import JointGroupCommand, JointSingleCommand

        self.secs = None
        self.nsecs = None
        self.qpos = None
        # BUG FIX: qvel and data were only assigned inside the subscriber
        # callback; initialize them so consumers (e.g. RealEnv.get_qvel) see
        # None instead of raising AttributeError before the first message.
        self.qvel = None
        self.data = None
        self.effort = None
        self.arm_command = None
        self.gripper_command = None
        self.is_debug = is_debug

        if init_node:
            rospy.init_node('recorder', anonymous=True)
        rospy.Subscriber(f"/puppet_{side}/joint_states", JointState, self.puppet_state_cb)
        rospy.Subscriber(f"/puppet_{side}/commands/joint_group", JointGroupCommand, self.puppet_arm_commands_cb)
        rospy.Subscriber(f"/puppet_{side}/commands/joint_single", JointSingleCommand, self.puppet_gripper_commands_cb)
        if self.is_debug:
            self.joint_timestamps = deque(maxlen=50)
            self.arm_command_timestamps = deque(maxlen=50)
            self.gripper_command_timestamps = deque(maxlen=50)
        time.sleep(0.1)

    def puppet_state_cb(self, data):
        """Cache the latest joint-state message."""
        self.qpos = data.position
        self.qvel = data.velocity
        self.effort = data.effort
        self.data = data
        if self.is_debug:
            self.joint_timestamps.append(time.time())

    def puppet_arm_commands_cb(self, data):
        """Cache the latest arm group command."""
        self.arm_command = data.cmd
        if self.is_debug:
            self.arm_command_timestamps.append(time.time())

    def puppet_gripper_commands_cb(self, data):
        """Cache the latest gripper command."""
        self.gripper_command = data.cmd
        if self.is_debug:
            self.gripper_command_timestamps.append(time.time())

    def print_diagnostics(self):
        """Print observed message frequencies (debug mode only)."""
        def dt_helper(ts):
            ts = np.array(ts)
            return np.mean(ts[1:] - ts[:-1])

        joint_freq = 1 / dt_helper(self.joint_timestamps)
        arm_command_freq = 1 / dt_helper(self.arm_command_timestamps)
        gripper_command_freq = 1 / dt_helper(self.gripper_command_timestamps)
        print(f'{joint_freq=:.2f}\n{arm_command_freq=:.2f}\n{gripper_command_freq=:.2f}\n')


def get_arm_joint_positions(bot):
    """Return the 6 arm joint positions (gripper excluded)."""
    return bot.arm.core.joint_states.position[:6]


def get_arm_gripper_positions(bot):
    """Return the gripper joint position (index 6 of the joint states)."""
    return bot.gripper.core.joint_states.position[6]


def move_arms(bot_list, target_pose_list, move_time=1):
    """Linearly interpolate all arms from their current to their target poses."""
    num_steps = int(move_time / DT)
    curr_pose_list = [get_arm_joint_positions(bot) for bot in bot_list]
    traj_list = [
        np.linspace(curr_pose, target_pose, num_steps)
        for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)
    ]
    for t in range(num_steps):
        for bot_id, bot in enumerate(bot_list):
            bot.arm.set_joint_positions(traj_list[bot_id][t], blocking=False)
        time.sleep(DT)


def move_grippers(bot_list, target_pose_list, move_time):
    """Linearly interpolate all grippers from current to target positions."""
    gripper_command = JointSingleCommand(name="gripper")
    num_steps = int(move_time / DT)
    curr_pose_list = [get_arm_gripper_positions(bot) for bot in bot_list]
    traj_list = [
        np.linspace(curr_pose, target_pose, num_steps)
        for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)
    ]
    for t in range(num_steps):
        for bot_id, bot in enumerate(bot_list):
            gripper_command.cmd = traj_list[bot_id][t]
            bot.gripper.core.pub_single.publish(gripper_command)
        time.sleep(DT)


def setup_puppet_bot(bot):
    """Configure a puppet (follower) arm: position-mode arm, torque enabled."""
    bot.dxl.robot_reboot_motors("single", "gripper", True)
    bot.dxl.robot_set_operating_modes("group", "arm", "position")
    bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
    torque_on(bot)


def setup_master_bot(bot):
    """Configure a master (leader) arm: PWM-mode arm, torque off for teleop."""
    bot.dxl.robot_set_operating_modes("group", "arm", "pwm")
    bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
    torque_off(bot)


def set_standard_pid_gains(bot):
    """Restore the default position P/I gains."""
    bot.dxl.robot_set_motor_registers("group", "arm", 'Position_P_Gain', 800)
    bot.dxl.robot_set_motor_registers("group", "arm", 'Position_I_Gain', 0)


def set_low_pid_gains(bot):
    """Soften the arm with low position P gain (compliant mode)."""
    bot.dxl.robot_set_motor_registers("group", "arm", 'Position_P_Gain', 100)
    bot.dxl.robot_set_motor_registers("group", "arm", 'Position_I_Gain', 0)


def torque_off(bot):
    """Disable torque on the arm group and the gripper."""
    bot.dxl.robot_torque_enable("group", "arm", False)
    bot.dxl.robot_torque_enable("single", "gripper", False)


def torque_on(bot):
    """Enable torque on the arm group and the gripper."""
    bot.dxl.robot_torque_enable("group", "arm", True)
    bot.dxl.robot_torque_enable("single", "gripper", True)
+""" + +import logging +import os +import sys +import time +from collections import deque +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Optional, Union + +import draccus +import tqdm +import torch +import json +import numpy as np + +# Append current directory so that interpreter can find experiments.robot +sys.path.append(".") +from experiments.robot.aloha.aloha_utils import ( + get_aloha_env, + get_aloha_image, + get_aloha_wrist_images, + get_next_task_label, + save_rollout_video, +) +from experiments.robot.openvla_utils import ( + get_vla, + get_vla_action, + get_action_head, + get_processor, + get_proprio_projector, +) +from experiments.robot.robot_utils import ( + DATE_TIME, + get_image_resize_size, + set_seed_everywhere, +) + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[logging.StreamHandler()], +) +logger = logging.getLogger(__name__) + + +@dataclass +class GenerateConfig: + # fmt: off + + ################################################################################################################# + # Model-specific parameters + ################################################################################################################# + model_family: str = "openvla" # Model family + pretrained_checkpoint: Union[str, Path] = "" # Pretrained checkpoint path + + use_l1_regression: bool = True # If True, uses continuous action head with L1 regression objective + use_diffusion: bool = False # If True, uses continuous action head with diffusion modeling objective (DDIM) + num_diffusion_steps: int = 50 # (When `diffusion==True`) Number of diffusion steps for inference + use_film: bool = False # If True, uses FiLM to infuse language inputs into visual features + num_images_in_input: int = 3 # Number of images in the VLA input (default: 3) + use_proprio: bool = True # Whether to include proprio state in input + + center_crop: bool = 
True # Center crop? (if trained w/ random crop image aug) + num_open_loop_steps: int = 25 # Number of actions to execute open-loop before requerying policy + + unnorm_key: Union[str, Path] = "" # Action un-normalization key + + load_in_8bit: bool = False # (For OpenVLA only) Load with 8-bit quantization + load_in_4bit: bool = False # (For OpenVLA only) Load with 4-bit quantization + + ################################################################################################################# + # ALOHA environment-specific parameters + ################################################################################################################# + num_rollouts_planned: int = 50 # Number of test rollouts + max_steps: int = 1500 # Max number of steps per rollout + use_relative_actions: bool = False # Whether to use relative actions (delta joint angles) + + ################################################################################################################# + # Utils + ################################################################################################################# + run_id_note: Optional[str] = None # Extra note to add to end of run ID for logging + local_log_dir: str = "./experiments/logs" # Local directory for eval logs + + seed: int = 7 # Random Seed (for reproducibility) + + save_version: str = "vla-adapter" # version of + use_pro_version: bool = True # encourage to use the pro models we released. 
+ phase: str = "Inference" + + # fmt: on + + +class LocalVLAModel: + """Local VLA model for direct inference without server.""" + + def __init__(self, cfg: GenerateConfig): + self.cfg = cfg + + # Load model + self.vla = get_vla(cfg) + + # Load proprio projector + self.proprio_projector = None + if cfg.use_proprio: + self.proprio_projector = get_proprio_projector(cfg, self.vla.llm_dim, 7) # PROPRIO_DIM = 14 + + # Load continuous action head + self.action_head = None + if cfg.use_l1_regression or cfg.use_diffusion: + self.action_head = get_action_head(cfg, self.vla.llm_dim) + + # Check that the model contains the action un-normalization key + assert cfg.unnorm_key in self.vla.norm_stats, f"Action un-norm key {cfg.unnorm_key} not found in VLA `norm_stats`!" + + # Get Hugging Face processor + self.processor = get_processor(cfg) + + # Get expected image dimensions + self.resize_size = get_image_resize_size(cfg) + + logger.info("Local VLA model loaded successfully") + + def get_action(self, observation: Dict[str, Any]) -> np.ndarray: + """Get action from local model.""" + instruction = observation["instruction"] + + action = get_vla_action( + self.cfg, + self.vla, + self.processor, + observation, + instruction, + action_head=self.action_head, + proprio_projector=self.proprio_projector, + use_film=self.cfg.use_film, + ) + + return action + + +def validate_config(cfg: GenerateConfig) -> None: + """Validate configuration parameters.""" + assert cfg.pretrained_checkpoint, "Must provide pretrained_checkpoint for local model deployment!" + assert os.path.exists(cfg.pretrained_checkpoint), f"Checkpoint path {cfg.pretrained_checkpoint} does not exist!" 
def setup_logging(cfg: GenerateConfig):
    """Create the run ID and open a per-run local log file.

    Returns:
        (open log file handle, its path, the run ID string).
    """
    run_id = f"EVAL-LOCAL-{cfg.model_family}-{DATE_TIME}"
    if cfg.run_id_note is not None:
        run_id += f"--{cfg.run_id_note}"

    os.makedirs(cfg.local_log_dir, exist_ok=True)
    local_log_filepath = os.path.join(cfg.local_log_dir, run_id + ".txt")
    log_file = open(local_log_filepath, "w")
    logger.info(f"Logging to local log file: {local_log_filepath}")

    return log_file, local_log_filepath, run_id


def log_message(message: str, log_file=None):
    """Log a message to console and optionally to a log file.

    NOTE(review): both `print` and `logger.info` (with a StreamHandler) write
    to the console, so messages appear twice on screen; kept as-is since the
    duplication may be intentional (stdout vs. stderr).
    """
    print(message)
    logger.info(message)
    if log_file:
        log_file.write(message + "\n")
        log_file.flush()


def prepare_observation(obs, resize_size):
    """Build the policy-input observation dict from an environment TimeStep.

    The left wrist camera is not used in this single-arm setup.
    """
    # Get preprocessed images.
    img = get_aloha_image(obs)
    right_wrist_img = get_aloha_wrist_images(obs)

    # Resize images to the size expected by the model.
    from experiments.robot.openvla_utils import resize_image_for_policy
    img_resized = resize_image_for_policy(img, resize_size)
    right_wrist_img_resized = resize_image_for_policy(right_wrist_img, resize_size)

    observation = {
        "full_image": img_resized,
        "right_wrist_image": right_wrist_img_resized,
        "state": obs.observation["qpos"],
    }
    return observation, img_resized, right_wrist_img_resized


def run_episode(
    cfg: GenerateConfig,
    env,
    task_description: str,
    local_model: LocalVLAModel,
    resize_size,
    log_file=None,
):
    """Run a single episode in the ALOHA environment.

    Returns:
        (episode_stats, raw replay frames, resized frames, left-wrist frames,
        right-wrist frames). The left-wrist list stays empty in this
        single-arm setup.
    """
    # Control frequency: 50 Hz.
    STEP_DURATION_IN_SEC = 1.0 / 50.0

    # Reset environment.
    obs = env.reset()

    # Actions are executed open-loop from this queue until it drains, then the
    # policy is requeried.
    action_queue = deque(maxlen=cfg.num_open_loop_steps)

    t = 0
    curr_state = None
    replay_images = []
    replay_images_resized = []
    replay_images_left_wrist_resized = []
    replay_images_right_wrist_resized = []

    log_message("Prepare the scene, and then press Enter to begin...", log_file)
    input()

    # Reset environment again to fetch the first timestep observation.
    obs = env.reset()

    # Fetch initial robot state (sleep first so that the robot stops moving).
    time.sleep(2)
    curr_state = env.get_qpos()

    episode_start_time = time.time()
    total_model_query_time = 0.0

    try:
        while t < cfg.max_steps:
            # Step start time (used to compute how long to sleep between steps).
            step_start_time = time.time()

            # Get observation.
            obs = env.get_observation(t=t)

            # Save raw high camera image for the replay video.
            replay_images.append(obs.observation["images"]["cam_high"])

            # If the action queue is empty, requery the model.
            if len(action_queue) == 0:
                observation, img_resized, right_wrist_resized = prepare_observation(obs, resize_size)
                observation["instruction"] = task_description

                # Save processed images for replay.
                replay_images_resized.append(img_resized)
                replay_images_right_wrist_resized.append(right_wrist_resized)

                # Query model to get an open-loop chunk of actions.
                log_message("Querying local model...", log_file)
                model_query_start_time = time.time()
                actions = local_model.get_action(observation)
                actions = actions[: cfg.num_open_loop_steps]
                total_model_query_time += time.time() - model_query_start_time
                action_queue.extend(actions)

            # Get next action from the queue.
            action = action_queue.popleft()
            log_message("-----------------------------------------------------", log_file)
            log_message(f"t: {t}", log_file)
            log_message(f"action: {action}", log_file)

            # Execute action in environment.
            if cfg.use_relative_actions:
                # Convert the relative action (delta joint angles) into an
                # absolute target, assuming the previous command was reached.
                target_state = curr_state + action
                obs = env.step(target_state.tolist())
                curr_state = target_state
            else:
                obs = env.step(action.tolist())
            t += 1

            # Sleep to maintain the control frequency.
            step_elapsed_time = time.time() - step_start_time
            if step_elapsed_time < STEP_DURATION_IN_SEC:
                time_to_sleep = STEP_DURATION_IN_SEC - step_elapsed_time
                log_message(f"Sleeping {time_to_sleep} sec...", log_file)
                time.sleep(time_to_sleep)

    except (KeyboardInterrupt, Exception) as e:
        if isinstance(e, KeyboardInterrupt):
            log_message("\nCaught KeyboardInterrupt: Terminating episode early.", log_file)
        else:
            log_message(f"\nCaught exception: {e}", log_file)

    episode_end_time = time.time()

    # Get success feedback from the user.
    user_input = input("Success? Enter 'y' or 'n': ")
    success = user_input.lower() == 'y'  # simplified from `True if ... else False`

    episode_stats = {
        "success": success,
        "total_steps": t,
        "model_query_time": total_model_query_time,
        "episode_duration": episode_end_time - episode_start_time,
    }

    return (
        episode_stats,
        replay_images,
        replay_images_resized,
        replay_images_left_wrist_resized,
        replay_images_right_wrist_resized,
    )


def save_episode_videos(
    replay_images,
    replay_images_resized,
    replay_images_left_wrist,
    replay_images_right_wrist,
    episode_idx,
    success,
    task_description,
    log_file=None,
):
    """Save videos of the episode from the different camera angles.

    BUG FIX: `run_episode` never fills the left-wrist list in this single-arm
    setup, so empty frame lists are now skipped instead of being written as
    (broken) empty videos.
    """
    streams = [
        (replay_images, None),
        (replay_images_resized, "resized"),
        (replay_images_left_wrist, "left_wrist_resized"),
        (replay_images_right_wrist, "right_wrist_resized"),
    ]
    for frames, notes in streams:
        if not frames:
            continue  # Nothing recorded for this camera; skip.
        if notes is None:
            save_rollout_video(
                frames, episode_idx, success=success, task_description=task_description, log_file=log_file
            )
        else:
            save_rollout_video(
                frames,
                episode_idx,
                success=success,
                task_description=task_description,
                log_file=log_file,
                notes=notes,
            )
task_description=task_description, + log_file=log_file, + notes="left_wrist_resized", + ) + save_rollout_video( + replay_images_right_wrist, + episode_idx, + success=success, + task_description=task_description, + log_file=log_file, + notes="right_wrist_resized", + ) + + +@draccus.wrap() +def eval_aloha_local(cfg: GenerateConfig) -> None: + """Main function to evaluate a trained policy in a real-world ALOHA environment with local model.""" + # Validate configuration + validate_config(cfg) + + # Set random seed + set_seed_everywhere(cfg.seed) + + # Setup logging + log_file, local_log_filepath, run_id = setup_logging(cfg) + + # Load local model + log_message("Loading local VLA model...", log_file) + local_model = LocalVLAModel(cfg) + + # Get expected image dimensions + resize_size = get_image_resize_size(cfg) + + # Get ALOHA environment + env = get_aloha_env() + + # Initialize task description + task_description = "" + + # Start evaluation + num_rollouts_completed, total_successes = 0, 0 + + for episode_idx in tqdm.tqdm(range(cfg.num_rollouts_planned)): + # Get task description from user + task_description = get_next_task_label(task_description) + log_message(f"\nTask: {task_description}", log_file) + + log_message(f"Starting episode {num_rollouts_completed + 1}...", log_file) + + # Run episode + episode_stats, replay_images, replay_images_resized, replay_images_left_wrist, replay_images_right_wrist = ( + run_episode(cfg, env, task_description, local_model, resize_size, log_file) + ) + + # Update counters + num_rollouts_completed += 1 + if episode_stats["success"]: + total_successes += 1 + + # Save videos + save_episode_videos( + replay_images, + replay_images_resized, + replay_images_left_wrist, + replay_images_right_wrist, + num_rollouts_completed, + episode_stats["success"], + task_description, + log_file, + ) + + # Log results + log_message(f"Success: {episode_stats['success']}", log_file) + log_message(f"# episodes completed so far: {num_rollouts_completed}", 
log_file) + log_message(f"# successes: {total_successes} ({total_successes / num_rollouts_completed * 100:.1f}%)", log_file) + log_message(f"Total model query time: {episode_stats['model_query_time']:.2f} sec", log_file) + log_message(f"Total episode elapsed time: {episode_stats['episode_duration']:.2f} sec", log_file) + + # Calculate final success rate + final_success_rate = float(total_successes) / float(num_rollouts_completed) if num_rollouts_completed > 0 else 0 + + # Log final results + log_message("\nFinal results:", log_file) + log_message(f"Total episodes: {num_rollouts_completed}", log_file) + log_message(f"Total successes: {total_successes}", log_file) + log_message(f"Overall success rate: {final_success_rate:.4f} ({final_success_rate * 100:.1f}%)", log_file) + + # Close log file + if log_file: + log_file.close() + + return final_success_rate + + +if __name__ == "__main__": + eval_aloha_local() \ No newline at end of file diff --git a/prismatic/vla/constants.py b/prismatic/vla/constants.py index 0985a83..f604309 100644 --- a/prismatic/vla/constants.py +++ b/prismatic/vla/constants.py @@ -41,8 +41,8 @@ class NormalizationType(str, Enum): ALOHA_CONSTANTS = { "NUM_ACTIONS_CHUNK": 25, - "ACTION_DIM": 14, - "PROPRIO_DIM": 14, + "ACTION_DIM": 7, + "PROPRIO_DIM": 7, "ACTION_PROPRIO_NORMALIZATION_TYPE": NormalizationType.BOUNDS, } diff --git a/prismatic/vla/datasets/datasets.py b/prismatic/vla/datasets/datasets.py index 2f2daff..b21d7aa 100644 --- a/prismatic/vla/datasets/datasets.py +++ b/prismatic/vla/datasets/datasets.py @@ -167,7 +167,7 @@ def __init__( # fmt: off if "aloha" in self.data_mix: - load_camera_views = ("primary", "left_wrist", "right_wrist") + load_camera_views = ("primary", "right_wrist") else: load_camera_views = ("primary", "wrist") diff --git a/prismatic/vla/datasets/rlds/oxe/configs.py b/prismatic/vla/datasets/rlds/oxe/configs.py index 1367cd7..28e00b5 100644 --- a/prismatic/vla/datasets/rlds/oxe/configs.py +++ 
b/prismatic/vla/datasets/rlds/oxe/configs.py @@ -37,6 +37,7 @@ class StateEncoding(IntEnum): POS_QUAT = 2 # EEF XYZ (3) + Quaternion (4) + Gripper Open/Close (1) JOINT = 3 # Joint Angles (7, if fewer) + Gripper Open/Close (1) JOINT_BIMANUAL = 4 # Joint Angles (2 x [ Joint Angles (6) + Gripper Open/Close (1) ]) + JOINT_SINGLE = 5 # Joint Angles (1 x [ Joint Angles (6) + Gripper Open/Close (1) ]) # fmt: on @@ -47,6 +48,7 @@ class ActionEncoding(IntEnum): JOINT_POS = 2 # Joint Delta Position (7) + Gripper Open/Close (1) JOINT_POS_BIMANUAL = 3 # Joint Delta Position (2 x [ Joint Delta Position (6) + Gripper Open/Close (1) ]) EEF_R6 = 4 # EEF Delta XYZ (3) + R6 (6) + Gripper Open/Close (1) + JOINT_POS_SINGLE = 5 # Joint Delta Position (1 x [ Joint Delta Position (6) + Gripper Open/Close (1) ]) # fmt: on @@ -720,4 +722,11 @@ class ActionEncoding(IntEnum): "state_encoding": StateEncoding.JOINT_BIMANUAL, "action_encoding": ActionEncoding.JOINT_POS_BIMANUAL, }, + "aloha_put_x_into_the_box_80_demos": { + "image_obs_keys": {"primary": "image", "secondary": None, "left_wrist": None, "right_wrist": "right_wrist_image"}, + "depth_obs_keys": {"primary": None, "secondary": None, "wrist": None}, + "state_obs_keys": ["state"], + "state_encoding": StateEncoding.JOINT_SINGLE, + "action_encoding": ActionEncoding.JOINT_POS_SINGLE, + }, } diff --git a/prismatic/vla/datasets/rlds/oxe/materialize.py b/prismatic/vla/datasets/rlds/oxe/materialize.py index fd4103d..0b3209b 100644 --- a/prismatic/vla/datasets/rlds/oxe/materialize.py +++ b/prismatic/vla/datasets/rlds/oxe/materialize.py @@ -29,8 +29,8 @@ def make_oxe_dataset_kwargs( ) -> Dict[str, Any]: """Generates config (kwargs) for given dataset from Open-X Embodiment.""" dataset_kwargs = deepcopy(OXE_DATASET_CONFIGS[dataset_name]) - if dataset_kwargs["action_encoding"] not in [ActionEncoding.EEF_POS, ActionEncoding.EEF_R6, ActionEncoding.JOINT_POS_BIMANUAL]: - raise ValueError(f"Cannot load `{dataset_name}`; only EEF_POS & EEF_R6 & 
JOINT_POS_BIMANUAL actions supported!") + if dataset_kwargs["action_encoding"] not in [ActionEncoding.EEF_POS, ActionEncoding.EEF_R6, ActionEncoding.JOINT_POS_BIMANUAL, ActionEncoding.JOINT_POS_SINGLE]: + raise ValueError(f"Cannot load `{dataset_name}`; only EEF_POS & EEF_R6 & JOINT_POS_BIMANUAL & JOINT_POS_SINGLE actions supported!") # [Contract] For EEF_POS & EEF_R6 actions, only the last action dimension (gripper) is absolute! # Normalize all action dimensions *except* the gripper @@ -43,6 +43,9 @@ def make_oxe_dataset_kwargs( elif dataset_kwargs["action_encoding"] is ActionEncoding.JOINT_POS_BIMANUAL: dataset_kwargs["absolute_action_mask"] = [True] * 14 dataset_kwargs["action_normalization_mask"] = [True] * 14 + elif dataset_kwargs["action_encoding"] is ActionEncoding.JOINT_POS_SINGLE: + dataset_kwargs["absolute_action_mask"] = [True] * 7 + dataset_kwargs["action_normalization_mask"] = [True] * 7 dataset_kwargs["action_proprio_normalization_type"] = action_proprio_normalization_type # Adjust Loaded Camera Views