Introduction: Two Hands in Action
In the previous post, we set up and calibrated a dual-arm system. Now it's time to put it to work with practical bimanual tasks. These are tasks that single-arm robots simply cannot do — and the very reason dual-arm robots are attracting enormous attention in the robotics community.
This post will guide you through training bimanual policies for three classic tasks: towel folding, water pouring, and assembly. We'll also compare the performance of ACT and Diffusion Policy on bimanual tasks, analyze failure modes, and provide solutions.
Task 1: Towel Folding
Towel folding is a classic benchmark for bimanual manipulation because it requires:
- Coordination: Both hands must move synchronously
- Deformable object handling: towels are soft, and their dynamics are hard to predict
- Precision: Towel edges must align properly
Towel Folding Task Setup
from dataclasses import dataclass
from typing import List, Optional

import numpy as np
@dataclass
class TowelFoldingTask:
    """Configuration for towel folding task.

    Phases:
        1. Approach: Both hands approach 2 corners
        2. Grasp: Firmly grip 2 corners
        3. Lift: Lift the towel
        4. Fold: Fold in half (left folds to right or vice versa)
        5. Release: Release the folded towel
    """

    # Towel (width, height) in meters: 30x30 cm by default.
    towel_size: tuple = (0.3, 0.3)
    # Towel center position; None is replaced in __post_init__ so each
    # instance gets its own ndarray (annotation fixed: was `np.ndarray = None`).
    towel_position: Optional[np.ndarray] = None
    # One of "half", "quarter", "triangle".
    fold_type: str = "half"

    def __post_init__(self):
        # A mutable ndarray default must be created per instance, never
        # shared as a class-level default.
        if self.towel_position is None:
            self.towel_position = np.array([0.3, 0.0, 0.01])

    @property
    def corner_positions(self) -> dict:
        """Positions of 4 towel corners as a name -> xyz mapping."""
        cx, cy, cz = self.towel_position
        w, h = self.towel_size
        return {
            "top_left": np.array([cx - w/2, cy + h/2, cz]),
            "top_right": np.array([cx + w/2, cy + h/2, cz]),
            "bottom_left": np.array([cx - w/2, cy - h/2, cz]),
            "bottom_right": np.array([cx + w/2, cy - h/2, cz]),
        }

    def get_grasp_points(self) -> tuple:
        """Return 2 grasp points for the fold type.

        Any fold type other than "triangle" (including "quarter" and
        unknown values) grasps the two top corners, same as "half".
        """
        corners = self.corner_positions
        if self.fold_type == "half":
            return corners["top_left"], corners["top_right"]
        if self.fold_type == "triangle":
            return corners["top_left"], corners["bottom_right"]
        return corners["top_left"], corners["top_right"]
def collect_towel_folding_data(robot, dataset, num_episodes=100, max_steps=400):
    """Collect teleoperation data for the towel folding task.

    Each episode mirrors the leader arms onto the follower arms while
    recording camera images, joint states and actions into `dataset`.

    Args:
        robot: Dual-arm robot exposing `get_observation()`, `leader_arms`
            and `follower_arms` — project type; structure assumed from use.
        dataset: Recording dataset exposing `add_frame` / `save_episode`.
        num_episodes: Number of episodes to record.
        max_steps: Step count after which an episode stops (previously a
            hard-coded 400; now a parameter with the same default, so one
            extra frame at step == max_steps is still recorded, preserving
            the original cut-off behavior).

    Returns:
        The dataset with all recorded episodes.

    Tips:
        - Use thin, square towel, not too large (30x30cm)
        - Lay towel flat before each episode
        - Fold slowly and smoothly for easier policy learning
        - Record at least 100 episodes
    """
    for ep in range(num_episodes):
        print(f"\nEpisode {ep+1}/{num_episodes}")
        print("Lay towel flat, press Enter to start...")
        input()
        step = 0
        recording = True
        while recording:
            obs = robot.get_observation()
            # Leader arms are the teleoperation masters; their positions
            # become the follower goal (= the recorded action).
            left_target = robot.leader_arms["left"].read("Present_Position")
            right_target = robot.leader_arms["right"].read("Present_Position")
            robot.follower_arms["left"].write("Goal_Position", left_target)
            robot.follower_arms["right"].write("Goal_Position", right_target)
            # Followers' actual positions form the proprioceptive state.
            left_state = robot.follower_arms["left"].read("Present_Position")
            right_state = robot.follower_arms["right"].read("Present_Position")
            dataset.add_frame({
                "observation.images.top": obs["top_camera"],
                "observation.images.left_wrist": obs["left_wrist_camera"],
                "observation.images.right_wrist": obs["right_wrist_camera"],
                "observation.state": np.concatenate([left_state, right_state]),
                "action": np.concatenate([left_target, right_target]),
            })
            step += 1
            if step > max_steps:
                recording = False
        dataset.save_episode()
    return dataset
Training Bimanual Folding Policy
from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.act.modeling_act import ACTPolicy
import torch
def train_folding_policy(dataset_repo_id, num_epochs=300):
    """Train ACT policy for towel folding.

    Args:
        dataset_repo_id: Hub repo id of the recorded folding dataset.
        num_epochs: Number of training epochs.

    Returns:
        The trained ACTPolicy; the best checkpoint (lowest average epoch
        loss) is also saved to "best_folding_policy.pt".
    """
    from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

    dataset = LeRobotDataset(dataset_repo_id)

    config = ACTConfig(
        input_shapes={
            "observation.images.top": [3, 480, 640],
            "observation.images.left_wrist": [3, 480, 640],
            "observation.images.right_wrist": [3, 480, 640],
            "observation.state": [14],  # presumably 7 joints x 2 arms — verify
        },
        output_shapes={"action": [14]},
        input_normalization_modes={
            "observation.images.top": "mean_std",
            "observation.images.left_wrist": "mean_std",
            "observation.images.right_wrist": "mean_std",
            "observation.state": "min_max",
        },
        output_normalization_modes={"action": "min_max"},
        # Folding needs large chunks for smooth trajectories
        chunk_size=100,
        n_action_steps=100,
        dim_model=512,
        n_heads=8,
        n_layers=4,
        use_vae=True,
        latent_dim=32,
        kl_weight=10.0,
    )
    policy = ACTPolicy(config)
    # BUG FIX: hard-coded torch.device("cuda") crashed on CPU-only hosts.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy.to(device)

    optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-5, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=num_epochs, eta_min=1e-6
    )
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=8, shuffle=True, num_workers=4
    )

    best_loss = float('inf')
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        n_batches = 0
        for batch in dataloader:
            # BUG FIX: calling .to(device) on every value raises for
            # non-tensor batch entries (e.g. string metadata); only move
            # actual tensors.
            batch = {
                k: v.to(device) if isinstance(v, torch.Tensor) else v
                for k, v in batch.items()
            }
            output = policy.forward(batch)
            loss = output["loss"]
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients to stabilize training.
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.0)
            optimizer.step()
            epoch_loss += loss.item()
            n_batches += 1
        scheduler.step()
        # Guard against an empty dataloader (would divide by zero).
        avg_loss = epoch_loss / max(n_batches, 1)
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(policy.state_dict(), "best_folding_policy.pt")
        if (epoch + 1) % 25 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | "
                  f"Best: {best_loss:.4f}")
    return policy
Task 2: Water Pouring
Pouring requires precise temporal coordination — one hand holds the cup while the other tilts the pitcher:
@dataclass
class PouringTask:
    """Configuration for water pouring task.

    Right arm: Holds pitcher
    Left arm: Holds cup

    Phases:
        1. Grasp pitcher (right) and cup (left)
        2. Lift both
        3. Align pitcher above cup
        4. Tilt pitcher gradually (30 -> 60 -> 75 degrees)
        5. Return pitcher upright
        6. Place both down
    """

    # Volumes in milliliters; pour angle in degrees.
    pitcher_volume_ml: float = 500
    cup_capacity_ml: float = 250
    pour_angle_deg: float = 75
    pour_speed: float = 0.5

    def get_pour_trajectory(self, n_steps=100):
        """Generate the tilt-angle trajectory (radians) for the pouring phase.

        The first half ramps linearly from upright (0) to the target
        pour angle; the second half ramps back down to upright.
        """
        apex = np.deg2rad(self.pour_angle_deg)
        first_half = n_steps // 2
        second_half = n_steps - first_half
        ramp_up = np.linspace(0, apex, first_half)
        ramp_down = np.linspace(apex, 0, second_half)
        return np.concatenate([ramp_up, ramp_down])
def evaluate_pouring(policy, env, n_episodes=30):
    """Evaluate pouring policy.

    Metrics:
        - Pour accuracy: Water in cup / total water poured
        - Spill rate: Water spilled outside
        - Cup stability: Whether cup was dropped

    Args:
        policy: Object with `predict(obs) -> action`.
        env: Gymnasium-style env (`reset`, 5-tuple `step`).
        n_episodes: Number of evaluation rollouts.

    Returns:
        dict with per-episode pour accuracies and aggregate counters.
    """
    results = {
        "pour_accuracy": [],
        "spill_count": 0,
        "cup_dropped": 0,
        "success": 0,
    }
    for ep in range(n_episodes):
        obs, info = env.reset()
        done = False
        while not done:
            action = policy.predict(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
        water_in_cup = info.get("water_in_cup", 0)
        water_spilled = info.get("water_spilled", 0)
        total_water = water_in_cup + water_spilled
        # BUG FIX: `accuracy` was assigned only when total_water > 0, so the
        # success check below read an undefined (first episode) or stale
        # (later episodes) value when no water was poured. Reset it to 0.0
        # for every episode.
        accuracy = 0.0
        if total_water > 0:
            accuracy = water_in_cup / total_water
            results["pour_accuracy"].append(accuracy)
        if info.get("cup_dropped", False):
            results["cup_dropped"] += 1
        if water_spilled > 10:
            results["spill_count"] += 1
        if accuracy > 0.9 and not info.get("cup_dropped", False):
            results["success"] += 1
    avg_accuracy = np.mean(results["pour_accuracy"]) if results["pour_accuracy"] else 0
    print(f"Success rate: {results['success']/n_episodes:.1%}")
    print(f"Pour accuracy: {avg_accuracy:.1%}")
    return results
Task 3: Assembly
Assembly tasks require contact coordination — one hand holds, the other manipulates:
@dataclass
class AssemblyTask:
    """Configuration for assembly task.

    Example: Screwing a bottle cap
    Left arm: Holds bottle
    Right arm: Screws cap
    """

    # Supported values: "screw_cap", "peg_in_hole".
    task_type: str = "screw_cap"

    def get_subtasks(self) -> List[str]:
        """Return the ordered sub-task names for this task type.

        Unknown task types yield an empty list.
        """
        subtask_table = {
            "screw_cap": [
                "grasp_bottle_left",
                "grasp_cap_right",
                "align_cap_to_bottle",
                "screw_clockwise",
                "verify_tight",
                "release_both",
            ],
            "peg_in_hole": [
                "grasp_base_left",
                "grasp_peg_right",
                "align_peg_to_hole",
                "insert_peg",
                "verify_inserted",
                "release_both",
            ],
        }
        return subtask_table.get(self.task_type, [])
def train_assembly_policy(dataset_repo_id):
    """Train policy for assembly task.

    Assembly needs:
        - Smaller chunk_size (high precision)
        - Wrist cameras are critical (close-up contact)
        - Force feedback if available
    """
    from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig
    from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy

    # Three RGB views, all at 480x640; state/action are 14-D (two arms).
    camera_keys = [
        "observation.images.top",
        "observation.images.left_wrist",
        "observation.images.right_wrist",
    ]
    input_shapes = {key: [3, 480, 640] for key in camera_keys}
    input_shapes["observation.state"] = [14]
    input_norms = {key: "mean_std" for key in camera_keys}
    input_norms["observation.state"] = "min_max"

    config = DiffusionConfig(
        input_shapes=input_shapes,
        output_shapes={"action": [14]},
        input_normalization_modes=input_norms,
        output_normalization_modes={"action": "min_max"},
        num_inference_steps=50,
        down_dims=[256, 512, 1024],
        n_obs_steps=2,      # observation history conditioning
        horizon=16,         # predicted trajectory length
        n_action_steps=8,   # smaller executed chunk for precision
        noise_scheduler_type="DDIM",
        vision_backbone="resnet18",
        crop_shape=[84, 84],
    )
    return DiffusionPolicy(config)
ACT vs Diffusion Policy for Bimanual: Comparison
Benchmark Results (Typical)
| Task | ACT | Diffusion Policy | Notes |
|---|---|---|---|
| Towel Folding | 65% | 72% | Diffusion better with deformable |
| Pouring | 78% | 75% | ACT better with smooth trajectories |
| Assembly (cap) | 55% | 68% | Diffusion better with contact-rich |
| Cube Transfer | 85% | 82% | ACT better with simple tasks |
| Inference | 8ms | 25ms (DDIM) | ACT 3x faster |
Failure Modes and Fixes
Common Bimanual Failure Modes
# Catalog of common bimanual failure modes, keyed by the identifiers that
# analyze_failures() tallies. Each entry holds a human-readable description,
# an observed frequency (share of all failures — presumably from the
# authors' own evaluations; verify against your runs), and candidate fixes.
BIMANUAL_FAILURE_MODES = {
    "temporal_desync": {
        "description": "Arms are not synchronized — one moves faster",
        "frequency": "30% of failures",
        "fix": [
            "Increase chunk_size for longer planning horizon",
            "Add temporal sync loss penalty",
            "Collect data with more consistent speed",
        ],
    },
    "grasp_slip": {
        "description": "One arm drops object mid-task",
        "frequency": "25% of failures",
        "fix": [
            "Add force observation if sensor available",
            "Train separate grasp maintenance policy",
            "Increase gripper action frequency",
        ],
    },
    "contact_collision": {
        "description": "Arms collide with each other",
        "frequency": "20% of failures",
        "fix": [
            "Add self-collision penalty during training",
            "Use workspace separation constraints",
            "Collect data more carefully, avoiding collisions",
        ],
    },
    "wrong_sequence": {
        "description": "Sub-tasks executed in wrong order",
        "frequency": "15% of failures",
        "fix": [
            "Use hierarchical policy (post 5)",
            "Add sub-task indicator in observation",
            "Curriculum learning from simple to complex",
        ],
    },
    "overshoot": {
        "description": "Excessive movement — pours too much, folds too hard",
        "frequency": "10% of failures",
        "fix": [
            "Reduce action magnitude during training",
            "Use action smoothing (EMA)",
            "Add boundary constraints",
        ],
    },
}
def analyze_failures(episodes, success_threshold=0.8):
    """Analyze failure modes from evaluation episodes.

    Args:
        episodes: Sequence of episode dicts with at least "success_rate",
            plus optional detection signals "arm_desync" (float),
            "grasp_lost" (bool) and "self_collision" (bool).
        success_threshold: Episodes with a success rate below this count
            as failures.

    Returns:
        dict mapping failure-mode name -> detected count (previously the
        function returned None; returning the tally lets callers aggregate
        results programmatically — printed output is unchanged).
    """
    failures = {mode: 0 for mode in BIMANUAL_FAILURE_MODES}
    total_failures = 0
    for ep in episodes:
        if ep["success_rate"] < success_threshold:
            total_failures += 1
            if ep.get("arm_desync", 0) > 0.1:
                failures["temporal_desync"] += 1
            if ep.get("grasp_lost", False):
                failures["grasp_slip"] += 1
            if ep.get("self_collision", False):
                failures["contact_collision"] += 1
            # NOTE(review): "wrong_sequence" and "overshoot" have no
            # detection signal here, so they always stay at 0 — add
            # detectors or drop them from the report.
    print(f"Total failures: {total_failures}/{len(episodes)}")
    # Counts above zero imply total_failures > 0, so the division is safe.
    for mode, count in sorted(failures.items(), key=lambda x: -x[1]):
        if count > 0:
            pct = count / total_failures * 100
            print(f" {mode}: {count} ({pct:.0f}%)")
    return failures
Temporal Synchronization
class TemporalSyncModule:
    """Module for synchronizing actions between 2 arms.

    Ensures left and right arms move synchronously,
    especially critical for contact-rich tasks.
    """

    def __init__(self, sync_weight=0.1):
        # Scale applied to the raw synchronization penalty.
        self.sync_weight = sync_weight

    def compute_sync_loss(self, left_actions, right_actions):
        """Compute sync loss between arms.

        Penalizes deviations of the per-step speed ratio from 1,
        i.e. moments where one arm moves faster than the other.
        """
        def step_speeds(actions):
            # Magnitude of the displacement between consecutive timesteps.
            deltas = actions[:, 1:] - actions[:, :-1]
            return torch.norm(deltas, dim=-1)

        left_speed = step_speeds(left_actions)
        right_speed = step_speeds(right_actions)
        # Epsilon keeps the ratio finite when an arm is stationary.
        speed_ratio = (left_speed + 1e-6) / (right_speed + 1e-6)
        deviation = speed_ratio - 1.0
        return self.sync_weight * torch.mean(deviation ** 2)
Reference Papers
- ALOHA 2: An Enhanced Low-Cost Hardware for Bimanual Teleoperation — Aldaco et al., 2024 — Hardware upgrades and results
- Bi-KVIL: Keypoints-based Visual Imitation Learning for Bimanual Manipulation — Grotz et al., CoRL 2024 — Keypoint-based bimanual approach
- ACT: Learning Fine-Grained Bimanual Manipulation — Zhao et al., RSS 2023 — Foundation paper for bimanual ACT
Conclusion and Next Steps
Bimanual manipulation unlocks tasks that single-arm robots cannot perform. Key insights:
- Towel folding: Deformable objects need more data, Diffusion Policy often better
- Pouring: Temporal precision favors ACT with smooth chunks
- Assembly: Contact-rich tasks favor Diffusion Policy + wrist cameras
- Failure analysis matters more than just looking at success rates
The next post — Mobile Manipulation — adds a new dimension: movement. The robot must not only manipulate objects but also navigate the environment while doing so.
Related Posts
- Dual-Arm Setup & Calibration — Setup before running tasks
- Bimanual Manipulation Overview — Bimanual theory overview
- Long-Horizon Tasks — Planning for multi-step tasks