hello-robot · hello-cpaxton · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025 · Jan 3, 2025
diff --git a/src/stretch/agent/robot_agent.py b/src/stretch/agent/robot_agent.py
@@ -269,6 +269,14 @@ def _start_threads(self):
             self._update_map_thread = None
             self._get_observations_thread = None
 
+    def get_parameters(self) -> Parameters:
+        """Return the parameters for this model"""
+        return self.parameters
+
+    def get_semantic_sensor(self) -> OvmmPerception:
+        """Return the semantic sensor in use by this model"""
+        return self.semantic_sensor
+
     def get_robot(self) -> AbstractRobotClient:
         """Return the robot in use by this model"""
         return self.robot
@@ -578,7 +586,7 @@ def rotate_in_place(
 
     def get_command(self):
         """Get a command from config file or from user input if not specified."""
-        task = self.parameters.get("task").get("command")
+        task = self.parameters.get("task/command")
         if task is not None and len(task) > 0:
             return task
         else:
@@ -2332,12 +2340,12 @@ def get_object_centric_observations(
             scene_graph = self.extract_symbolic_spatial_info(instances)
 
         world_representation = self.get_object_centric_world_representation(
-            instances,
-            self.parameters.get("vlm/vlm_context_length", 20),
-            self.parameters.get("vlm/sample_strategy", "clip"),
-            task,
-            self.encode_text(task),
-            scene_graph,
+            instance_memory=instances,
+            task=task,
+            text_features=self.encode_text(task),
+            max_context_length=self.parameters.get("vlm/vlm_context_length", 20),
+            sample_strategy=self.parameters.get("vlm/sample_strategy", "clip"),
+            scene_graph=scene_graph,
         )
         return world_representation
 

diff --git a/src/stretch/agent/vlm_planner.py b/src/stretch/agent/vlm_planner.py
@@ -94,35 +94,46 @@ def plan(
             plan_with_scene_graph=plan_with_scene_graph,
         )
         output = self.get_output_from_gpt(world_representation, task=query)
-        if show_plan:
+
+        print()
+        print("Plan from VLMs:")
+        print(output)
+        print()
+        if output == "explore":
+            print(">>>>>> Planner cannot find a plan, the robot should explore more >>>>>>>>>")
+            actions = None
+        elif output == "gpt API error":
+            print(">>>>>> there is something wrong with the planner api >>>>>>>>>")
+            actions = None
+        else:
+            actions = output.split("; ")
+
+        if show_plan and actions is not None:
             import re
 
             import matplotlib.pyplot as plt
 
-            if output == "explore":
-                print(">>>>>> Planner cannot find a plan, the robot should explore more >>>>>>>>>")
-            elif output == "gpt API error":
-                print(">>>>>> there is something wrong with the planner api >>>>>>>>>")
-            else:
-                actions = output.split("; ")
-                plt.clf()
-                for action_id, action in enumerate(actions):
-                    crop_id = int(re.search(r"img_(\d+)", action).group(1))
-                    global_id = world_representation.object_images[crop_id].instance_id
-                    plt.subplot(1, len(actions), action_id + 1)
-                    plt.imshow(
-                        self.voxel_map.get_instances()[global_id].get_best_view().get_image()
-                    )
-                    plt.title(action.split("(")[0] + f" instance {global_id}")
-                    plt.axis("off")
-                plt.suptitle(f"Task: {query}")
-                plt.show()
-                plt.savefig("plan.png")
+            plt.clf()
+            for action_id, action in enumerate(actions):
+                res = re.search(r"img_(\d+)", action)
+                if res is None:
+                    print(f"{action_id}: Action {action} does not have a crop id")
+                    continue
+                crop_id = int(res.group(1))
+                global_id = world_representation.object_images[crop_id].instance_id
+                plt.subplot(1, len(actions), action_id + 1)
+                plt.imshow(self.voxel_map.get_instances()[global_id].get_best_view().get_image())
+                plt.title(action.split("(")[0] + f" instance {global_id}")
+                plt.axis("off")
+            plt.suptitle(f"Task: {query}")
+            plt.show()
+            plt.savefig("plan.png")
 
         if self.parameters.get("save_vlm_plan", True):
             with open(plan_file, "w") as f:
                 f.write(output)
             print(f"Task plan generated from VLMs has been written to {plan_file}")
+
         return actions, world_representation
 
     def get_output_from_gpt(self, world_rep, task: str):

diff --git a/src/stretch/app/vlm_planning.py b/src/stretch/app/vlm_planning.py
@@ -26,7 +26,7 @@
 
 from stretch.agent import RobotAgent
 from stretch.agent.vlm_planner import VLMPlanner
-from stretch.agent.zmq_client import HomeRobotZMQClient
+from stretch.agent.zmq_client import HomeRobotZmqClient
 from stretch.core import get_parameters
 from stretch.core.interfaces import Observations
 from stretch.perception import create_semantic_sensor
@@ -107,7 +107,6 @@ def images_to_video(image_list, output_path, fps=30):
     default="",
     help="Input path. If empty, run on the real robot.",
 )
-@click.option("--local", is_flag=True, help="Run code locally on the robot.")
 @click.option("--robot_ip", default="")
 @click.option("--task", "-t", type=str, default="", help="Task to run with the planner.")
 @click.option(
@@ -145,12 +144,14 @@ def main(
     show_instances: bool = False,
     api_key: str = None,
     task: str = "",
-    local: bool = False,
     robot_ip: str = "",
 ):
     """Simple script to load a voxel map"""
-    input_path = Path(input_path)
-    print("Loading:", input_path)
+    if len(input_path) > 0:
+        input_path = Path(input_path)
+        print("Loading:", input_path)
+    else:
+        input_path = None
 
     loaded_voxel_map = None
 
@@ -172,10 +173,11 @@ def main(
     print("Creating semantic sensors...")
     semantic_sensor = create_semantic_sensor(parameters=vlm_parameters)
 
-    if len(input_path) > 0:
+    if input_path is not None:
+        # We will load data from a pickle file
         robot = DummyStretchClient()
     else:
-        robot = HomeRobotZMQClient(robot_ip=robot_ip, local=local)
+        robot = HomeRobotZmqClient(robot_ip=robot_ip)
 
     print("Creating robot agent...")
     agent = RobotAgent(
@@ -186,11 +188,12 @@ def main(
     )
     voxel_map = agent.get_voxel_map()
 
-    if len(input_path) > 0:
+    if input_path is not None:
         # load from pickle
         voxel_map.read_from_pickle(input_path, num_frames=frame, perception=semantic_sensor)
     else:
         # Scan the local area to get a map
+        agent.start()
         agent.rotate_in_place()
 
     # get the task
@@ -207,6 +210,9 @@ def main(
     # )
     run_vlm_planner(agent, task, show_svm, test_vlm, api_key, show_instances)
 
+    # Finish up
+    robot.stop()
+
 
 def run_vlm_planner(
     agent,
@@ -249,110 +255,112 @@ def run_vlm_planner(
         footprint = robot.get_footprint()
         print(f"{x0} valid = {space.is_valid(x0)}")
         voxel_map.show(instances=show_instances, orig=start_xyz, xyt=x0, footprint=footprint)
+        return
 
-    if test_vlm:
-        start_is_valid = space.is_valid(x0, verbose=True, debug=False)
-        if not start_is_valid:
-            print("you need to manually set the start pose to be valid")
-            return
+    start_is_valid = space.is_valid(x0, verbose=True, debug=False)
+    if not start_is_valid:
+        print("you need to manually set the start pose to be valid")
+        return
 
-        print("\nFirst plan with the original map: ")
-        original_plan, world_rep = vlm_planner.plan(
-            current_pose=x0,
-            show_plan=True,
-            query=task,
-            plan_with_reachable_instances=False,
-            plan_with_scene_graph=False,
-        )
+    print("\nFirst plan with the original map: ")
+    original_plan, world_rep = vlm_planner.plan(
+        current_pose=x0,
+        show_plan=True,
+        query=task,
+        plan_with_reachable_instances=False,
+        plan_with_scene_graph=False,
+    )
 
-        # loop over the plan and check feasibilities for each action
-        preconditions = {}
-        while len(original_plan) > 0:
-            current_action = original_plan.pop(0)
+    # loop over the plan and check feasibilities for each action
+    preconditions = {}
+    while len(original_plan) > 0:
+        current_action = original_plan.pop(0)
 
-            # navigation action only for now
-            if "goto" not in current_action:
-                continue
+        # navigation action only for now
+        if "goto" not in current_action:
+            continue
 
-            # get the target object instance from the action
-            crop_id = int(re.search(r"img_(\d+)", current_action).group(1))
-            global_id = world_rep.object_images[crop_id].instance_id
-            current_instance = voxel_map.get_instances()[global_id]
+        # get the target object instance from the action
+        crop_id = int(re.search(r"img_(\d+)", current_action).group(1))
+        global_id = world_rep.object_images[crop_id].instance_id
+        current_instance = voxel_map.get_instances()[global_id]
 
-            print(f"Checking feasibility of action: {current_action}")
-            motion_plan = agent.plan_to_instance_for_manipulation(
-                current_instance, start=np.array(start_xyz)
-            )
-            feasible = motion_plan.get_success()
+        print(f"Checking feasibility of action: {current_action}")
+        motion_plan = agent.plan_to_instance_for_manipulation(
+            current_instance, start=np.array(start_xyz)
+        )
+        feasible = motion_plan.get_success()
 
-            if feasible:
+        if feasible:
+            continue
+
+        print(f"Action {current_action} is not feasible.")
+        print("Searching over the map and replanning...")
+        # loop over all instances in the map and try to find a feasible action
+        # TODO: This is highly inefficient and should be replaced with scene graph
+        planning_map = copy.deepcopy(voxel_map)
+        planning_xyz = start_xyz
+        for removed_instance in voxel_map.get_instances():
+
+            # skip the current instance
+            if removed_instance == current_instance:
                 continue
 
-            print(f"Action {current_action} is not feasible.")
-            print("Searching over the map and replanning...")
-            # loop over all instances in the map and try to find a feasible action
-            # TODO: This is highly inefficient and should be replaced with scene graph
-            planning_map = copy.deepcopy(voxel_map)
-            planning_xyz = start_xyz
-            for removed_instance in voxel_map.get_instances():
-
-                # skip the current instance
-                if removed_instance == current_instance:
-                    continue
-
-                # manually remove an instance for testing planning.
-                new_map = copy.deepcopy(planning_map)
-                new_map.delete_instance(
-                    removed_instance, force_update=True, min_bound_z=0.05, assume_explored=True
-                )
+            # manually remove an instance for testing planning.
+            new_map = copy.deepcopy(planning_map)
+            new_map.delete_instance(
+                removed_instance, force_update=True, min_bound_z=0.05, assume_explored=True
+            )
 
-                # # visulize the deleted_instance and the new map
-                # pcd = o3d.geometry.PointCloud()
-                # pcd.points = o3d.utility.Vector3dVector(removed_instance.point_cloud.numpy())
-                # o3d.visualization.draw_geometries([pcd])
-                # new_map.show()
-
-                # create a new agent for planning with the updated map
-                planning_agent = RobotAgent(
-                    robot,
-                    vlm_parameters,
-                    voxel_map=new_map,
-                    semantic_sensor=semantic_sensor,
-                )
+            # # visulize the deleted_instance and the new map
+            # pcd = o3d.geometry.PointCloud()
+            # pcd.points = o3d.utility.Vector3dVector(removed_instance.point_cloud.numpy())
+            # o3d.visualization.draw_geometries([pcd])
+            # new_map.show()
+
+            # create a new agent for planning with the updated map
+            planning_agent = RobotAgent(
+                robot,
+                vlm_parameters,
+                voxel_map=new_map,
+                semantic_sensor=semantic_sensor,
+            )
 
-                # motion planning with the new map
-                motion_plan = planning_agent.plan_to_instance_for_manipulation(
-                    removed_instance, start=np.array(planning_xyz)
-                )
-                feasible = motion_plan.get_success()
-                if feasible:
-                    print(
-                        f"Found a feasible motion plan for action: {current_action} by removing instance {removed_instance.global_id}"
-                    )
-                    planning_xyz = motion_plan.get_trajectory()[-1].state
-                    planning_map = new_map
-
-                    # find img_id of the removed instance
-                    for obj_im in world_rep.object_images:
-                        if obj_im.instance_id == removed_instance.global_id:
-                            removed_crop_id = obj_im.crop_id
-                            break
-
-                    # append preconditions
-                    preconditions[current_action] = removed_crop_id
-                    break
-
-            print("\nPlan with the task with preconditions: ")
-            print(preconditions)
-            for action, crop_id in preconditions.items():
-                task += f" Before {action}, relocate img_{crop_id} to another instance."
-            vlm_planner.plan(
-                current_pose=x0,
-                show_plan=True,
-                query=task,
-                plan_with_reachable_instances=False,
-                plan_with_scene_graph=False,
+            # motion planning with the new map
+            motion_plan = planning_agent.plan_to_instance_for_manipulation(
+                removed_instance, start=np.array(planning_xyz)
             )
+            feasible = motion_plan.get_success()
+            if feasible:
+                print(
+                    f"Found a feasible motion plan for action: {current_action} by removing instance {removed_instance.global_id}"
+                )
+                planning_xyz = motion_plan.get_trajectory()[-1].state
+                planning_map = new_map
+
+                # find img_id of the removed instance
+                for obj_im in world_rep.object_images:
+                    if obj_im.instance_id == removed_instance.global_id:
+                        removed_crop_id = obj_im.crop_id
+                        break
+
+                # append preconditions
+                preconditions[current_action] = removed_crop_id
+                break
+
+        print("\nPlan with the task with preconditions: ")
+        print(preconditions)
+        for action, crop_id in preconditions.items():
+            task += f" Before {action}, relocate img_{crop_id} to another instance."
+
+        # Run the planner
+        vlm_planner.plan(
+            current_pose=x0,
+            show_plan=True,
+            query=task,
+            plan_with_reachable_instances=False,
+            plan_with_scene_graph=False,
+        )
 
 
 if __name__ == "__main__":

diff --git a/src/stretch/config/app/vlm_planning/multi_crop_vlm_planner.yaml b/src/stretch/config/app/vlm_planning/multi_crop_vlm_planner.yaml
@@ -26,5 +26,5 @@ vlm:
   replanning: False
   sample_strategy: "clip"
 
-task:
-  command: "Pick up a basket."
+# task:
+#  command: "Pick up a basket."