Introduction: Robots Must Move
All tasks in this series so far assumed the robot is stationary with the workspace within reach. But the real world doesn't work that way — a household robot needs to walk to the kitchen, open cabinets, retrieve items, and bring them to the table. A warehouse robot needs to navigate between shelves, pick products, and deliver them to packing stations.
This is mobile manipulation — combining navigation with manipulation. This post — the 8th in the VLA & LeRobot Mastery series — explores how to extend systems from fixed-base to mobile, from action space challenges to whole-body coordination.
Mobile Manipulation Architecture
Extended Action Space
Moving from fixed-base to mobile base adds significant degrees of freedom:
| System | Action Dimensions | Details |
|---|---|---|
| Fixed single-arm | 7 | 6 joints + 1 gripper |
| Fixed dual-arm | 14 | 2 x (6 joints + 1 gripper) |
| Mobile single-arm | 9-10 | 2-3 base + 6 joints + 1 gripper |
| Mobile dual-arm | 16-17 | 2-3 base + 2 x (6 joints + 1 gripper) |
| Mobile ALOHA | 16 | 2 base vel + 2 x (6 joints + 1 gripper) |
import numpy as np
from dataclasses import dataclass
@dataclass
class MobileManipActionSpace:
    """Action space for mobile manipulation robot.
    Base: [v_x, v_y, omega] or [v_linear, omega] (differential drive)
    Left arm: [j1, j2, j3, j4, j5, j6, gripper]
    Right arm: [j1, j2, j3, j4, j5, j6, gripper]
    """
    base_type: str = "holonomic"
    n_arms: int = 2
    joints_per_arm: int = 6

    @property
    def base_dim(self) -> int:
        # Holonomic bases command (v_x, v_y, omega); differential only (v, omega).
        if self.base_type == "holonomic":
            return 3
        return 2

    @property
    def arm_dim(self) -> int:
        # Each arm contributes its joint targets plus one gripper channel.
        per_arm = self.joints_per_arm + 1
        return per_arm * self.n_arms

    @property
    def total_dim(self) -> int:
        return self.base_dim + self.arm_dim

    def split_action(self, action):
        """Split a flat action vector into base and per-arm components."""
        cut = self.base_dim
        base_part, arm_part = action[:cut], action[cut:]
        if self.n_arms != 2:
            return base_part, arm_part
        per_arm = self.joints_per_arm + 1
        return base_part, arm_part[:per_arm], arm_part[per_arm:]
# Instantiate a differential-drive, dual-arm configuration and report its size.
action_space = MobileManipActionSpace(
    joints_per_arm=6,
    n_arms=2,
    base_type="differential",
)
print(f"Total action dim: {action_space.total_dim}")
# Output: Total action dim: 16
Navigation-Manipulation Coordination
class NavManipCoordinator:
    """Coordinates navigation and manipulation.
    3 modes:
    1. NAVIGATE: Only base moves, arms in safe position
    2. MANIPULATE: Base stationary, arms operate
    3. COORDINATED: Base and arms operate simultaneously
    """
    def __init__(self):
        self.mode = "NAVIGATE"
        # Planar distance (meters) below which manipulation can begin.
        self.base_threshold = 0.3
        # Tucked joint configuration (+ open gripper) used while driving.
        self.arm_safe_position = np.array([0, -1.5, 1.0, 0, 0.5, 0, 1.0])

    def decide_mode(self, robot_pos, target_pos, task_phase):
        """Decide operating mode from the planar distance to the target.

        Args:
            robot_pos: robot position; only the first two (x, y) entries are used.
            target_pos: target position; only the first two (x, y) entries are used.
            task_phase: high-level phase label, e.g. "approach" or "manipulate".

        Returns:
            One of "NAVIGATE", "MANIPULATE", "COORDINATED".
        """
        distance = np.linalg.norm(robot_pos[:2] - target_pos[:2])
        if task_phase == "approach" and distance > self.base_threshold:
            return "NAVIGATE"
        elif task_phase == "manipulate":
            return "MANIPULATE"
        else:
            return "COORDINATED"

    def apply_mode(self, action, mode):
        """Apply mode-specific constraints to a 16-D action (2 base + 2x7 arms).

        Returns a new array; the caller's `action` is never modified.
        """
        # Copy the base slice: slicing a numpy array yields a *view*, and the
        # original in-place `*= 0.5` below silently mutated the caller's array.
        base_action = np.array(action[:2], dtype=float)
        arm_action = np.asarray(action[2:])
        if mode == "NAVIGATE":
            # Drive with both arms parked in the safe pose.
            return np.concatenate([base_action, self.arm_safe_position, self.arm_safe_position])
        elif mode == "MANIPULATE":
            # Freeze the base; pass arm commands through unchanged.
            return np.concatenate([np.zeros(2), arm_action])
        else:
            # COORDINATED: damp base motion while the arms operate.
            base_action *= 0.5
            return np.concatenate([base_action, arm_action])
Simulation Environments for Mobile Manipulation
Habitat and AI2-THOR
# === Habitat (Meta) ===
import habitat
def create_habitat_env():
    """Create a Habitat environment configured for mobile manipulation.

    Starts from the PointNav test config, then switches the task to
    rearrangement and registers an arm action plus a base-velocity action.
    """
    cfg = habitat.get_config(
        "benchmark/nav/pointnav/pointnav_habitat_test.yaml"
    )
    cfg.defrost()  # unlock the (frozen) yacs-style config for editing
    cfg.TASK.TYPE = "RearrangeTask-v0"
    for action_name, action_type in (
        ("ARM_ACTION", "ArmAction"),
        ("BASE_VELOCITY", "BaseVelAction"),
    ):
        node = habitat.config.Config()
        node.TYPE = action_type
        setattr(cfg.TASK.ACTIONS, action_name, node)
    cfg.freeze()
    return habitat.Env(config=cfg)
# === AI2-THOR (Allen AI) ===
from ai2thor.controller import Controller
def create_ai2thor_env():
    """Create an AI2-THOR interactive environment (kitchen scene FloorPlan1)."""
    settings = dict(
        scene="FloorPlan1",
        gridSize=0.25,  # navigation grid resolution in meters
        renderDepthImage=True,
        renderInstanceSegmentation=True,
        width=640,
        height=480,
    )
    return Controller(**settings)
MuJoCo for Mobile Manipulation
import mujoco
import numpy as np
class MuJoCoMobileManip:
    """MuJoCo environment for mobile manipulation.
    Robot: Mobile base + 2 arms
    Task: Navigate to table, pick object, bring to another location
    """
    def __init__(self, xml_path):
        # Actuator index layout: 2 base, then 7 per arm (6 joints + gripper).
        self.model = mujoco.MjModel.from_xml_path(xml_path)
        self.data = mujoco.MjData(self.model)
        self.base_joints = [0, 1]
        self.left_arm_joints = list(range(2, 9))
        self.right_arm_joints = list(range(9, 16))

    def step(self, action):
        """Apply a 16-D action, advance one physics step, and return
        (obs, reward, done, info)."""
        # Slice the flat action into per-subsystem control commands.
        self.data.ctrl[self.base_joints] = action[:2]
        self.data.ctrl[self.left_arm_joints] = action[2:9]
        self.data.ctrl[self.right_arm_joints] = action[9:16]
        mujoco.mj_step(self.model, self.data)
        return self._get_obs(), self._compute_reward(), self._check_done(), {}

    def _get_obs(self):
        """Assemble the observation dict from simulator state."""
        # NOTE(review): qpos[:3] is read for the base while only 2 base
        # actuators are driven — presumably (x, y, theta) pose vs (v, omega)
        # commands; confirm against the MJCF model.
        return {
            "base_pos": self.data.qpos[:3].copy(),
            "base_vel": self.data.qvel[:3].copy(),
            "left_arm_joints": self.data.qpos[3:10].copy(),
            "right_arm_joints": self.data.qpos[10:17].copy(),
            "object_pos": self._get_object_positions(),
            "image": self._render_camera("head_camera"),
        }
Training Mobile Manipulation Policy
End-to-End Approach
from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.act.modeling_act import ACTPolicy
def create_mobile_manip_policy():
    """Create an ACT policy for mobile manipulation.
    Input: cameras + base state + arm states
    Output: base velocity + arm joint targets
    """
    camera_keys = (
        "observation.images.head",
        "observation.images.left_wrist",
        "observation.images.right_wrist",
    )
    input_shapes = {key: [3, 480, 640] for key in camera_keys}
    # NOTE(review): 19 != 3 base + 2x(6+1) = 17 as the original comment
    # claimed; likely 3 base pose + 2 base vel + 14 arm dims — confirm
    # against the dataset's state schema.
    input_shapes["observation.state"] = [19]
    input_norm = {key: "mean_std" for key in camera_keys}
    input_norm["observation.state"] = "min_max"
    config = ACTConfig(
        input_shapes=input_shapes,
        output_shapes={"action": [16]},  # 2 base vel + 2 x (6 joints + gripper)
        input_normalization_modes=input_norm,
        output_normalization_modes={"action": "min_max"},
        chunk_size=50,
        n_action_steps=50,
        dim_model=512,
        n_heads=8,
        n_layers=4,
        use_vae=True,
        latent_dim=32,
        kl_weight=10.0,
        vision_backbone="resnet18",
    )
    return ACTPolicy(config)
Example: Navigate, Pick, and Deliver
class PickAndDeliverTask:
    """Complete task: Robot navigates to table, picks object, delivers elsewhere.
    Phases:
    1. Navigate to source table
    2. Align with table
    3. Pick up object
    4. Navigate to destination
    5. Align with destination
    6. Place object
    """
    def __init__(self, source_pos, dest_pos, object_name):
        self.source_pos = np.array(source_pos)
        self.dest_pos = np.array(dest_pos)
        self.object_name = object_name
        self.phase = "navigate_to_source"

    def get_reward(self, robot_state, object_state):
        """Compute reward for the current phase (advances self.phase on success).

        Args:
            robot_state: flat state; first two entries are (x, y), last entry
                is the gripper opening (< 0.5 is treated as closed).
            object_state: object state; first three entries are (x, y, z).

        Returns:
            Scalar reward. Returns 0.0 for an unrecognized phase value
            (previously this raised UnboundLocalError).
        """
        robot_pos = robot_state[:2]
        object_pos = object_state[:3]
        gripper_closed = robot_state[-1] < 0.5
        # Default so an unexpected phase (e.g. a terminal "done") cannot
        # leave `reward` unbound.
        reward = 0.0
        if self.phase == "navigate_to_source":
            dist = np.linalg.norm(robot_pos - self.source_pos[:2])
            reward = -dist  # dense shaping: closer to the source is better
            if dist < 0.3:
                self.phase = "pick"
                reward += 1.0  # bonus for reaching the source table
        elif self.phase == "pick":
            # Success = object grasped AND lifted off the table surface.
            if gripper_closed and object_pos[2] > 0.1:
                self.phase = "navigate_to_dest"
                reward = 5.0
            else:
                dist_to_obj = np.linalg.norm(robot_pos - object_pos[:2])
                reward = -dist_to_obj
        elif self.phase == "navigate_to_dest":
            dist = np.linalg.norm(robot_pos - self.dest_pos[:2])
            reward = -dist
            if dist < 0.3:
                self.phase = "place"
                reward += 1.0
        elif self.phase == "place":
            dist_to_target = np.linalg.norm(object_pos[:2] - self.dest_pos[:2])
            # Success = object released within 5 cm of the destination.
            if dist_to_target < 0.05 and not gripper_closed:
                reward = 10.0
            else:
                reward = -dist_to_target
        return reward
# Example instantiation: carry a mug between two tabletop locations.
task = PickAndDeliverTask(
    object_name="coffee_mug",
    source_pos=[2.0, 1.0, 0.8],
    dest_pos=[4.0, 3.0, 0.8],
)
Whole-Body Control
class WholeBodyController:
    """Whole-body control for mobile manipulation.
    Coordinates base + arm movement to reach end-effector target
    while avoiding obstacles and respecting joint limits.
    """
    def __init__(self, robot_model):
        self.model = robot_model
        self.base_weight = 0.3  # base DOFs weighted lower: prefer arm motion
        self.arm_weight = 0.7
        self.obstacle_threshold = 0.5  # meters: repulsion active inside this radius

    def compute_action(self, current_state, ee_target, obstacles=None):
        """Compute whole-body action to reach end-effector target.

        Solves one weighted differential-IK step:
        action = W @ pinv(J_wb @ W) @ ee_error, where W trades off
        base motion (first 2 DOFs) against arm motion (14 DOFs).

        Args:
            current_state: full robot state; first two entries are base (x, y).
            ee_target: desired end-effector target (same space as FK output).
            obstacles: optional iterable of obstacle positions; first two
                entries of each are (x, y).

        Returns:
            Clipped whole-body action: 2 base entries then 14 arm entries.
        """
        current_ee = self._forward_kinematics(current_state)
        ee_error = ee_target - current_ee
        J_arm = self._compute_arm_jacobian(current_state)
        J_base = self._compute_base_jacobian(current_state)
        J_wb = np.hstack([J_base, J_arm])
        # Weight matrix over 2 base + 14 arm DOFs (2 arms x 7).
        W = np.diag([self.base_weight] * 2 + [self.arm_weight] * 14)
        J_pinv = np.linalg.pinv(J_wb @ W)
        action = W @ J_pinv @ ee_error
        # Explicit emptiness test: a bare `if obstacles:` raises
        # "truth value of an array is ambiguous" for numpy inputs.
        if obstacles is not None and len(obstacles) > 0:
            repulsion = self._compute_obstacle_repulsion(current_state, obstacles)
            action[:2] += repulsion
        # Safety limits: base velocity +-0.5, arm commands +-1.0.
        action[:2] = np.clip(action[:2], -0.5, 0.5)
        action[2:] = np.clip(action[2:], -1.0, 1.0)
        return action

    def _compute_obstacle_repulsion(self, state, obstacles):
        """Compute a planar repulsive force from nearby obstacles.

        Artificial-potential-field style gradient: obstacles beyond
        `obstacle_threshold` contribute nothing. An obstacle exactly at the
        robot position is skipped — the original formula divided by zero there.
        """
        robot_pos = state[:2]
        repulsion = np.zeros(2)
        for obs_pos in obstacles:
            diff = robot_pos - obs_pos[:2]
            dist = np.linalg.norm(diff)
            # Guard dist == 0 to avoid division by zero / NaNs.
            if 0.0 < dist < self.obstacle_threshold:
                force = (1.0 / dist - 1.0 / self.obstacle_threshold) / dist**2
                repulsion += force * diff / dist
        return repulsion * 0.1
Reference Papers
- Mobile ALOHA: Learning Bimanual Mobile Manipulation with Low-Cost Whole-Body Teleoperation — Fu et al., 2024 — Gold standard for mobile manipulation research
- HomeRobot: Open-Vocabulary Mobile Manipulation — Yenamandra et al., NeurIPS 2023 — Benchmark for household mobile manipulation
- OK-Robot: What Really Matters in Integrating Open-Knowledge Models for Robotics — Liu et al., 2024 — End-to-end mobile manipulation system
Conclusion and Next Steps
Mobile manipulation unlocks the ability to operate in real spaces, not limited by fixed workspaces. Key challenges:
- Large action space — 16+ dimensions require sufficiently powerful models
- Navigation-manipulation coordination — knowing when to move vs. when to manipulate
- Safety — avoiding collisions during combined movement and manipulation
The next post — Humanoid Manipulation: Whole-body Control — takes us to the pinnacle: humanoid robots with dozens of DOF, the greatest challenge in current robotics.
Related Posts
- Bimanual Tasks — Foundation for mobile bimanual
- Quadruped Locomotion — Navigation fundamentals
- Sim-to-Real Pipeline — Deploying mobile policies to real robots