Locomotion đơn thuần chỉ là nửa bài toán. Humanoid robot hữu ích khi nó có thể đi và làm việc đồng thời — bê hộp từ kho ra xe, đẩy xe hàng trong nhà máy, mở cửa và đi qua. Đây là bài toán loco-manipulation: kết hợp locomotion quality với task completion trong một policy duy nhất.
Tiếp nối từ H1-2 loco-manipulation basics, bài này đi sâu vào các task phức tạp và chiến lược training cho chúng.
Tại sao Loco-Manipulation khó hơn sum of parts?
Challenges
| Challenge | Giải thích |
|---|---|
| Dynamic CoM shift | Khi bê vật 5kg, CoM dịch chuyển → locomotion policy bị invalidate |
| Multi-objective | Vừa giữ balance, vừa track velocity, vừa hold object, vừa reach target |
| Contact richness | Feet-ground + hands-object + object-environment = nhiều contact modes |
| Temporal coupling | Phải phối hợp timing: bước → vươn tay → nắm → nhấc → bước tiếp |
| Observation complexity | Object pose, contact forces, task state + locomotion state |
Hierarchical vs End-to-End
# === Approach 1: Hierarchical (easier, modular) ===
class HierarchicalLocoManip:
    """
    Layered controller: a high-level task planner emits a locomotion
    command and an arm command; dedicated mid-level policies turn those
    into leg and arm joint actions (low level: joint torques).
    """
    def forward(self, task_obs):
        # Decompose the task into a walking target and an arm target.
        locomotion_cmd, manipulation_cmd = self.task_planner(task_obs)
        # A separate arm policy handles manipulation...
        upper_body = self.arm_policy(self.arm_obs, manipulation_cmd)
        # ...while the (pre-trained) locomotion policy drives the legs.
        lower_body = self.loco_policy(self.loco_obs, locomotion_cmd)
        # Legs first, then arms, concatenated along the action dimension.
        combined = torch.cat([lower_body, upper_body], dim=1)
        return combined
# === Approach 2: End-to-End (harder, higher ceiling) ===
class EndToEndLocoManip:
    """
    A single monolithic policy maps the full observation directly to
    actions for every joint.
    """
    def forward(self, full_obs):
        # One network controls everything — no hand-designed interface
        # between locomotion and manipulation.
        return self.policy(full_obs)
| | Hierarchical | End-to-End |
|---|---|---|
| Pros | Modular, reuse loco policy, dễ debug | Optimal coordination, emergent behaviors |
| Cons | Sub-optimal coupling, hand-designed interface | Cần nhiều data, khó train, khó debug |
| Training time | ~5h | ~20h+ |
| Phù hợp cho | Production, clear task structure | Research, complex interactions |
Task 1: Carry Box While Walking
import torch
class CarryBoxTask:
    """Pick a box up from a table, carry it to a target, and set it down.

    The task runs through four sequential phases
    (APPROACH -> GRASP -> CARRY -> PLACE). ``compute_reward`` both scores
    the current phase and advances ``self.phase`` when a transition
    condition is met.
    """
    def __init__(self, num_envs, device="cuda"):
        self.num_envs = num_envs
        self.device = device
        # Box properties
        self.box_size = [0.3, 0.3, 0.2]  # m
        self.box_mass_range = [1.0, 8.0]  # kg
        # Task phase ids, in execution order
        self.APPROACH = 0
        self.GRASP = 1
        self.CARRY = 2
        self.PLACE = 3
        # Persistent per-env buffers. BUGFIX: the original reset() created
        # brand-new tensors sized len(env_ids), silently discarding the
        # state of every environment that was NOT being reset. Allocating
        # full-size buffers once here and index-assigning in reset() makes
        # partial resets (standard in vectorized sims) correct.
        self.box_mass = torch.zeros(num_envs, device=device)
        self.box_start = torch.zeros(num_envs, 3, device=device)
        self.place_target = torch.zeros(num_envs, 3, device=device)
        self.phase = torch.zeros(num_envs, dtype=torch.long, device=device)
    def reset(self, env_ids):
        """Re-randomize task state for the environments in ``env_ids`` only."""
        n = len(env_ids)
        # Randomize box mass
        self.box_mass[env_ids] = torch.empty(n, device=self.device).uniform_(
            self.box_mass_range[0], self.box_mass_range[1]
        )
        # Box start: on a table (0.8 m height), 1-2 m ahead
        box_start = torch.zeros(n, 3, device=self.device)
        box_start[:, 0] = torch.empty(n, device=self.device).uniform_(1, 2)
        box_start[:, 2] = 0.8  # table height
        self.box_start[env_ids] = box_start
        # Place target: 3-5 m ahead
        place_target = torch.zeros(n, 3, device=self.device)
        place_target[:, 0] = torch.empty(n, device=self.device).uniform_(3, 5)
        place_target[:, 2] = 0.8  # target table
        self.place_target[env_ids] = place_target
        # Restart the phase state machine
        self.phase[env_ids] = self.APPROACH
    def compute_observation(self, robot_state, box_state):
        """Build the task-specific observation vector.

        Per-env layout: box_rel(3) + box_quat(4) + target_rel(3)
        + phase_onehot(4) + grasp_force(1) + hand_dists(2) = 17 dims.
        """
        obs = []
        # Box position offset from the base. NOTE(review): this is a
        # world-frame difference — it is not rotated into the robot's base
        # frame despite the original comment; confirm what the policy expects.
        box_rel = box_state["position"] - robot_state["base_pos"]
        obs.append(box_rel)  # (3,)
        # Box orientation quaternion
        obs.append(box_state["orientation"])  # (4,)
        # Place-target offset (world frame as well)
        target_rel = self.place_target - robot_state["base_pos"]
        obs.append(target_rel)  # (3,)
        # Current phase as one-hot so the policy can condition on it
        phase_onehot = torch.nn.functional.one_hot(
            self.phase, num_classes=4
        ).float()
        obs.append(phase_onehot)  # (4,)
        # Grasp status (scalar force reading)
        obs.append(robot_state["grasp_force"].unsqueeze(1))  # (1,)
        # Distance from each hand to the box
        for hand in ["left", "right"]:
            hand_pos = robot_state[f"{hand}_hand_pos"]
            dist = torch.norm(hand_pos - box_state["position"], dim=1)
            obs.append(dist.unsqueeze(1))  # (1,) x2
        return torch.cat(obs, dim=1)  # Total: 3+4+3+4+1+2 = 17
    def compute_reward(self, robot_state, box_state, action):
        """Score the current phase and advance the phase state machine.

        Phase masks are evaluated as each section is processed, so an env
        that satisfies a transition condition collects the next phase's
        reward within the same step. Returns ``(total, per_term_dict)``,
        both shaped ``(num_envs,)``. ``action`` is currently unused.
        """
        rewards = {}
        # Closest hand-to-box distance (used by the GRASP phase)
        hand_box_dist = torch.minimum(
            torch.norm(robot_state["left_hand_pos"] - box_state["position"], dim=1),
            torch.norm(robot_state["right_hand_pos"] - box_state["position"], dim=1),
        )
        # === Phase 0: APPROACH ===
        approach_mask = (self.phase == self.APPROACH).float()
        robot_box_dist = torch.norm(
            robot_state["base_pos"][:, :2] - self.box_start[:, :2], dim=1
        )
        rewards["approach"] = 0.5 * approach_mask * torch.exp(
            -robot_box_dist / 1.0
        )
        # Transition: close enough to the box
        close_to_box = (robot_box_dist < 0.5) & (self.phase == self.APPROACH)
        self.phase[close_to_box] = self.GRASP
        # === Phase 1: GRASP ===
        grasp_mask = (self.phase == self.GRASP).float()
        rewards["reach"] = 0.8 * grasp_mask * torch.exp(
            -hand_box_dist / 0.1
        )
        # Transition: hands touching the box
        grasped = (hand_box_dist < 0.05) & (self.phase == self.GRASP)
        self.phase[grasped] = self.CARRY
        # === Phase 2: CARRY ===
        carry_mask = (self.phase == self.CARRY).float()
        box_target_dist = torch.norm(
            box_state["position"][:, :2] - self.place_target[:, :2], dim=1
        )
        rewards["carry_progress"] = 1.0 * carry_mask * torch.exp(
            -box_target_dist / 2.0
        )
        # Keep the box stable (low angular velocity) while carrying
        box_tilt = torch.norm(box_state["angular_velocity"], dim=1)
        rewards["box_stability"] = -0.3 * carry_mask * box_tilt
        # Keep the box near carrying height (0.9 m)
        box_height_error = torch.abs(box_state["position"][:, 2] - 0.9)
        rewards["box_height"] = -0.5 * carry_mask * box_height_error
        # Transition: box near the place target
        near_target = (box_target_dist < 0.3) & (self.phase == self.CARRY)
        self.phase[near_target] = self.PLACE
        # === Phase 3: PLACE ===
        place_mask = (self.phase == self.PLACE).float()
        place_dist = torch.norm(
            box_state["position"] - self.place_target, dim=1
        )
        rewards["place"] = 2.0 * place_mask * torch.exp(
            -place_dist / 0.05
        )
        # Bonus for completing the task (box set down at the target)
        task_complete = (place_dist < 0.1) & (self.phase == self.PLACE)
        rewards["completion_bonus"] = 10.0 * task_complete.float()
        total = sum(rewards.values())
        return total, rewards
Task 2: Push Cart
class PushCartTask:
    """Push a cart from A to B.

    Main challenge: keep both hands in contact with the handle while
    walking behind the moving cart.
    """
    def __init__(self):
        self.cart_friction = 0.3
        self.cart_mass = 15.0  # kg
    def compute_reward(self, robot_state, cart_state, target_pos):
        """Return ``(total, per_term_dict)`` for the cart-pushing task."""
        rewards = {}
        # 1. Cart progress toward the target (xy-plane distance)
        cart_target_dist = torch.norm(
            cart_state["position"][:, :2] - target_pos[:, :2], dim=1
        )
        rewards["cart_progress"] = 1.0 * torch.exp(-cart_target_dist / 3.0)
        # 2. Maintain hand-cart contact: reward the WEAKER of the two
        # contact forces so both hands must stay on the handle.
        left_contact = robot_state["left_hand_contact_force"]
        right_contact = robot_state["right_hand_contact_force"]
        contact_reward = torch.minimum(left_contact, right_contact)
        rewards["contact"] = 0.3 * torch.clamp(contact_reward, max=1.0)
        # 3. Cart heading alignment with the bearing toward the target.
        cart_heading = cart_state["heading"]
        target_heading = torch.atan2(
            target_pos[:, 1] - cart_state["position"][:, 1],
            target_pos[:, 0] - cart_state["position"][:, 0],
        )
        # BUGFIX: wrap the angle difference into [-pi, pi] before taking
        # its magnitude. A raw |a - b| treats headings of +pi and -pi
        # (the same physical direction) as ~2*pi apart, zeroing the reward
        # exactly where alignment is perfect.
        heading_diff = cart_heading - target_heading
        heading_error = torch.abs(
            torch.atan2(torch.sin(heading_diff), torch.cos(heading_diff))
        )
        rewards["heading"] = 0.2 * torch.exp(-heading_error / 0.3)
        # 4. Smooth pushing: penalize jerky cart acceleration
        cart_accel = torch.norm(cart_state["acceleration"][:, :2], dim=1)
        rewards["smooth_push"] = -0.1 * cart_accel
        total = sum(rewards.values())
        return total, rewards
Task 3: Open Door + Walk Through
class OpenDoorTask:
    """Most complex task: open a door, then walk through the frame.

    Requires precise coordination between the arm and the legs.
    BUGFIX: the original version read ``self.phase`` in ``compute_reward``
    without ever initializing it, so the first call raised
    ``AttributeError``. ``reset`` (mirroring ``CarryBoxTask``) now
    allocates the per-env phase buffer; call it before ``compute_reward``.
    """
    def __init__(self):
        self.door_width = 0.9  # m
        self.handle_height = 1.0  # m
        self.door_mass = 8.0  # kg
        # Phase ids, in execution order
        self.APPROACH_DOOR = 0
        self.REACH_HANDLE = 1
        self.TURN_HANDLE = 2
        self.PUSH_DOOR = 3
        self.WALK_THROUGH = 4
        # Allocated by reset() once num_envs is known
        self.phase = None
    def reset(self, num_envs, device="cuda"):
        """Allocate (or restart) the per-env phase state machine."""
        self.phase = torch.zeros(num_envs, dtype=torch.long, device=device)
    def compute_reward(self, robot_state, door_state):
        """Return ``(total, per_term_dict)``; requires a prior ``reset``.

        NOTE(review): unlike CarryBoxTask, phase TRANSITIONS are not
        implemented here — ``self.phase`` is only read. Advancement must
        happen externally (TODO: confirm where, or add transitions).
        """
        rewards = {}
        # Phase 0: walk toward the door (xy distance to the handle)
        approach_mask = (self.phase == self.APPROACH_DOOR).float()
        door_dist = torch.norm(
            robot_state["base_pos"][:, :2] - door_state["handle_pos"][:, :2],
            dim=1
        )
        rewards["approach"] = 0.3 * approach_mask * torch.exp(
            -door_dist / 1.0
        )
        # Phase 1: reach the handle with the right hand
        reach_mask = (self.phase == self.REACH_HANDLE).float()
        hand_handle_dist = torch.norm(
            robot_state["right_hand_pos"] - door_state["handle_pos"],
            dim=1
        )
        rewards["reach_handle"] = 0.5 * reach_mask * torch.exp(
            -hand_handle_dist / 0.05
        )
        # Phase 2: turn the handle (wrist rotation)
        turn_mask = (self.phase == self.TURN_HANDLE).float()
        handle_angle = door_state["handle_angle"]  # 0 -> pi/4 = fully turned
        rewards["turn_handle"] = 0.5 * turn_mask * (
            handle_angle / (torch.pi / 4)
        )
        # Phase 3: push the door open
        push_mask = (self.phase == self.PUSH_DOOR).float()
        door_angle = door_state["door_angle"]  # 0 -> pi/2 = fully open
        rewards["push_door"] = 1.0 * push_mask * (
            door_angle / (torch.pi / 2)
        )
        # Phase 4: walk through the frame (past the frame + 0.5 m)
        walk_mask = (self.phase == self.WALK_THROUGH).float()
        through_door = robot_state["base_pos"][:, 0] > door_state["frame_pos"][:, 0] + 0.5
        rewards["walk_through"] = 5.0 * walk_mask * through_door.float()
        # === Locomotion quality (always active) ===
        # Keep balance in every phase: penalize tilt of the gravity vector
        upright = torch.sum(
            torch.square(robot_state["projected_gravity"][:, :2]), dim=1
        )
        rewards["upright"] = -1.5 * upright
        rewards["termination"] = -200.0 * robot_state["terminated"].float()
        total = sum(rewards.values())
        return total, rewards
Full Training Pipeline cho Loco-Manipulation
class LocoManipTrainer:
    """
    End-to-end training pipeline for loco-manipulation tasks.

    Combines the environment's locomotion observations with the task's
    observations into a single input for one whole-body policy, and adds
    the task reward on top of the base locomotion reward.
    """
    def __init__(self, env, task, device="cuda"):
        # `env` must expose get_observations()/step() plus robot_state and
        # object_state attributes; `task` must expose compute_observation()
        # and compute_reward() (see the task classes above).
        self.env = env
        self.task = task
        self.device = device
        # Policy: a larger MLP for whole-body control
        import torch.nn as nn
        self.policy = nn.Sequential(
            nn.Linear(78 + 17, 512),  # 78 locomotion obs + 17 task obs
            nn.ELU(),
            nn.Linear(512, 512),
            nn.ELU(),
            nn.Linear(512, 256),
            nn.ELU(),
            nn.Linear(256, 35),  # all joints (legs + arms + hands)
        ).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=3e-4)
    def train(self, num_iterations=20000):
        """Run the training loop for `num_iterations` iterations."""
        for iteration in range(num_iterations):
            # Collect experience: base obs from the env, task obs from the task
            obs = self.env.get_observations()
            task_obs = self.task.compute_observation(
                self.env.robot_state, self.env.object_state
            )
            full_obs = torch.cat([obs, task_obs], dim=1)
            # Policy inference
            actions = self.policy(full_obs)
            # Step environment
            next_obs, base_reward, done, info = self.env.step(actions)
            # Task reward (also advances the task's phase machine)
            task_reward, task_reward_terms = self.task.compute_reward(
                self.env.robot_state, self.env.object_state, actions
            )
            # Combined reward: locomotion quality + task progress
            total_reward = base_reward + task_reward
            # NOTE(review): this "simplified" update uses -mean(reward) as
            # the loss. Rewards come out of env.step and are presumably not
            # differentiable w.r.t. the policy parameters, so this is
            # illustrative pseudocode rather than a working PPO step — a
            # real implementation needs log-prob ratios, advantages, and
            # clipping. Confirm before reusing.
            loss = -total_reward.mean()
            self.optimizer.zero_grad()
            loss.backward()
            # Gradient clipping for stability
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1.0)
            self.optimizer.step()
            if iteration % 500 == 0:
                # Success rate is reported by the env via the info dict
                success_rate = info.get("task_success_rate", 0)
                print(f"Iter {iteration}: reward={total_reward.mean():.2f}, "
                      f"success={success_rate:.1%}")
Để hiểu thêm về locomotion nền tảng, xem bài Locomotion cơ bản. Về RL basics, xem RL cho Robotics.
Tổng kết
Loco-manipulation là bước tiến quan trọng từ locomotion thuần túy:
- Multi-phase tasks (approach → grasp → carry → place) yêu cầu reward design phức tạp
- Dynamic CoM shift khi bê vật → locomotion policy phải adapt online
- Hierarchical vs end-to-end: hierarchical dễ hơn cho production, end-to-end tiềm năng hơn
- 3 task examples: carry box, push cart, open door — mỗi task có challenges riêng
- Training time: 10-20h cho loco-manipulation vs 1-2h cho locomotion thuần
Bài tiếp theo — Humanoid Parkour — sẽ thử thách với jumping, climbing, và obstacle courses.
Tài liệu tham khảo
- HumanPlus: Humanoid Shadowing and Imitation from Humans — Fu et al., 2024
- OmniH2O: Universal and Dexterous Human-to-Humanoid Whole-Body Teleoperation and Learning — He et al., 2024
- Humanoid Loco-Manipulation with Learned Whole-Body Control — Chen et al., CoRL 2024
- Learning Whole-Body Manipulation for Quadrupedal Robot — Ha et al., 2024