Mobile Manipulation: Base di chuyển + Arms trên Mobile Robot

Giới thiệu: Robot phải biết di chuyển

Tất cả tasks trong series đến giờ đều giả định robot đứng yên một chỗ và workspace nằm trong tầm với. Nhưng thế giới thực không như vậy — robot gia đình cần đi đến bếp, mở tủ, lấy đồ, mang về bàn. Robot nhà kho cần navigate giữa các kệ, gắp hàng, và mang đến trạm đóng gói.

Đây là mobile manipulation — kết hợp di chuyển (navigation) với thao tác (manipulation). Bài viết này — bài thứ 8 trong series VLA & LeRobot Mastery — sẽ khám phá cách mở rộng hệ thống từ fixed-base sang mobile, từ các thách thức action space đến whole-body coordination.

Kiến trúc Mobile Manipulation

Action space mở rộng

Chuyển từ fixed-base sang mobile base thêm degrees of freedom đáng kể:

Hệ thống	Action Dimensions	Chi tiết
Fixed single-arm	7	6 joints + 1 gripper
Fixed dual-arm	14	2 × (6 joints + 1 gripper)
Mobile single-arm	9-10	2-3 base + 6 joints + 1 gripper
Mobile dual-arm	16-17	2-3 base + 2 × (6 joints + 1 gripper)
Mobile ALOHA	16	2 base vel + 2 × (6 joints + 1 gripper)

import numpy as np
from dataclasses import dataclass

@dataclass
class MobileManipActionSpace:
    """Action space cho mobile manipulation robot.
    
    Base: [v_x, v_y, omega] hoặc [v_linear, omega] (differential drive)
    Left arm: [j1, j2, j3, j4, j5, j6, gripper]
    Right arm: [j1, j2, j3, j4, j5, j6, gripper]
    """
    base_type: str = "holonomic"  # "holonomic" (3D) or "differential" (2D)
    n_arms: int = 2
    joints_per_arm: int = 6
    
    @property
    def base_dim(self):
        return 3 if self.base_type == "holonomic" else 2
    
    @property
    def arm_dim(self):
        return (self.joints_per_arm + 1) * self.n_arms  # +1 cho gripper
    
    @property
    def total_dim(self):
        return self.base_dim + self.arm_dim
    
    def split_action(self, action):
        """Tách action thành base và arm components."""
        base_action = action[:self.base_dim]
        arm_actions = action[self.base_dim:]
        
        if self.n_arms == 2:
            arm_dim_each = (self.joints_per_arm + 1)
            left_arm = arm_actions[:arm_dim_each]
            right_arm = arm_actions[arm_dim_each:]
            return base_action, left_arm, right_arm
        else:
            return base_action, arm_actions
    
    def merge_action(self, base, *arms):
        """Ghép base và arm actions."""
        return np.concatenate([base, *arms])


# Mobile ALOHA action space
action_space = MobileManipActionSpace(
    base_type="differential",  # 2D: linear + angular velocity
    n_arms=2,
    joints_per_arm=6,
)
print(f"Total action dim: {action_space.total_dim}")
# Output: Total action dim: 16

class NavManipCoordinator:
    """Phối hợp navigation và manipulation.
    
    3 chế độ:
    1. NAVIGATE: Chỉ di chuyển base, arms ở vị trí an toàn
    2. MANIPULATE: Base đứng yên, arms thao tác
    3. COORDINATED: Base và arms hoạt động đồng thời
    """
    
    def __init__(self):
        self.mode = "NAVIGATE"
        self.base_threshold = 0.3  # 30cm — khoảng cách đến target
        self.arm_safe_position = np.array([0, -1.5, 1.0, 0, 0.5, 0, 1.0])  # Tuck position
    
    def decide_mode(self, robot_pos, target_pos, task_phase):
        """Quyết định chế độ hoạt động."""
        distance = np.linalg.norm(robot_pos[:2] - target_pos[:2])
        
        if task_phase == "approach" and distance > self.base_threshold:
            return "NAVIGATE"
        elif task_phase == "manipulate":
            return "MANIPULATE"
        else:
            return "COORDINATED"
    
    def apply_mode(self, action, mode):
        """Áp dụng constraints theo mode."""
        base_action = action[:2]
        arm_action = action[2:]
        
        if mode == "NAVIGATE":
            # Arms ở vị trí an toàn, chỉ base di chuyển
            return np.concatenate([base_action, self.arm_safe_position, self.arm_safe_position])
        
        elif mode == "MANIPULATE":
            # Base đứng yên, chỉ arms hoạt động
            return np.concatenate([np.zeros(2), arm_action])
        
        else:  # COORDINATED
            # Cả base và arms hoạt động, giảm base speed
            base_action *= 0.5  # Chậm hơn khi coordinated
            return np.concatenate([base_action, arm_action])


# Sử dụng
coordinator = NavManipCoordinator()
mode = coordinator.decide_mode(
    robot_pos=np.array([0, 0, 0]),
    target_pos=np.array([2.0, 1.0, 0]),
    task_phase="approach",
)
print(f"Mode: {mode}")

Simulation Environments cho Mobile Manipulation

Habitat và AI2-THOR

# === Habitat (Meta) ===
# Tốt cho navigation + large-scale environments
import habitat
from habitat.config.default import get_agent_config

def create_habitat_env():
    """Tạo Habitat environment cho mobile manipulation."""
    config = habitat.get_config(
        "benchmark/nav/pointnav/pointnav_habitat_test.yaml"
    )
    
    # Custom cho manipulation
    config.defrost()
    config.TASK.TYPE = "RearrangeTask-v0"
    config.TASK.ACTIONS.ARM_ACTION = habitat.config.Config()
    config.TASK.ACTIONS.ARM_ACTION.TYPE = "ArmAction"
    config.TASK.ACTIONS.BASE_VELOCITY = habitat.config.Config()
    config.TASK.ACTIONS.BASE_VELOCITY.TYPE = "BaseVelAction"
    config.freeze()
    
    env = habitat.Env(config=config)
    return env


# === AI2-THOR (Allen AI) ===
# Tốt cho interactive environments (mở tủ, bật đèn, etc.)
from ai2thor.controller import Controller

def create_ai2thor_env():
    """Tạo AI2-THOR environment."""
    controller = Controller(
        scene="FloorPlan1",
        gridSize=0.25,
        renderDepthImage=True,
        renderInstanceSegmentation=True,
        width=640,
        height=480,
    )
    return controller

MuJoCo cho Mobile Manipulation

import mujoco
import numpy as np

class MuJoCoMobileManip:
    """MuJoCo environment cho mobile manipulation.
    
    Robot: Mobile base + 2 arms
    Task: Navigate đến bàn, gắp object, mang đến vị trí khác
    """
    
    def __init__(self, xml_path):
        self.model = mujoco.MjModel.from_xml_path(xml_path)
        self.data = mujoco.MjData(self.model)
        
        # Robot configuration
        self.base_joints = [0, 1]          # x, y velocity
        self.left_arm_joints = [2, 3, 4, 5, 6, 7, 8]  # 6 joints + gripper
        self.right_arm_joints = [9, 10, 11, 12, 13, 14, 15]
    
    def step(self, action):
        """Execute action và trả về observation."""
        # Split action
        base_vel = action[:2]
        left_arm = action[2:9]
        right_arm = action[9:16]
        
        # Apply base velocity
        self.data.ctrl[self.base_joints] = base_vel
        
        # Apply arm joint targets
        self.data.ctrl[self.left_arm_joints] = left_arm
        self.data.ctrl[self.right_arm_joints] = right_arm
        
        # Simulate
        mujoco.mj_step(self.model, self.data)
        
        # Get observation
        obs = self._get_obs()
        reward = self._compute_reward()
        done = self._check_done()
        
        return obs, reward, done, {}
    
    def _get_obs(self):
        """Trả về observation."""
        return {
            "base_pos": self.data.qpos[:3].copy(),       # x, y, theta
            "base_vel": self.data.qvel[:3].copy(),
            "left_arm_joints": self.data.qpos[3:10].copy(),
            "right_arm_joints": self.data.qpos[10:17].copy(),
            "object_pos": self._get_object_positions(),
            "image": self._render_camera("head_camera"),
        }

Training Mobile Manipulation Policy

End-to-end approach

from lerobot.common.policies.act.configuration_act import ACTConfig
from lerobot.common.policies.act.modeling_act import ACTPolicy

def create_mobile_manip_policy():
    """Tạo policy cho mobile manipulation.
    
    Input: cameras + base state + arm states
    Output: base velocity + arm joint targets
    """
    config = ACTConfig(
        input_shapes={
            "observation.images.head": [3, 480, 640],     # Head camera
            "observation.images.left_wrist": [3, 480, 640],
            "observation.images.right_wrist": [3, 480, 640],
            "observation.state": [19],  # 3 base + 2×(6+1) arms
        },
        output_shapes={
            "action": [16],  # 2 base vel + 2×(6+1) arms
        },
        input_normalization_modes={
            "observation.images.head": "mean_std",
            "observation.images.left_wrist": "mean_std",
            "observation.images.right_wrist": "mean_std",
            "observation.state": "min_max",
        },
        output_normalization_modes={
            "action": "min_max",
        },
        
        # Larger model cho mobile manip
        chunk_size=50,        # Shorter chunks — base needs responsive control
        n_action_steps=50,
        dim_model=512,
        n_heads=8,
        n_layers=4,
        use_vae=True,
        latent_dim=32,
        kl_weight=10.0,
        vision_backbone="resnet18",
    )
    
    return ACTPolicy(config)


def train_mobile_manip(policy, dataset, num_epochs=300):
    """Training loop cho mobile manipulation."""
    device = torch.device("cuda")
    policy.to(device)
    
    optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-5)
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=4, shuffle=True, num_workers=4
    )
    
    for epoch in range(num_epochs):
        epoch_loss = 0
        n_batches = 0
        
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            output = policy.forward(batch)
            loss = output["loss"]
            
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.0)
            optimizer.step()
            
            epoch_loss += loss.item()
            n_batches += 1
        
        if (epoch + 1) % 25 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} | Loss: {epoch_loss/n_batches:.4f}")
    
    return policy

Ví dụ: Navigate → Pick → Place

class PickAndDeliverTask:
    """Task hoàn chỉnh: Robot di chuyển đến bàn, gắp object, mang đến nơi khác.
    
    Phases:
    1. Navigate to source table
    2. Align with table
    3. Pick up object
    4. Navigate to destination
    5. Align with destination
    6. Place object
    """
    
    def __init__(self, source_pos, dest_pos, object_name):
        self.source_pos = np.array(source_pos)
        self.dest_pos = np.array(dest_pos)
        self.object_name = object_name
        self.phase = "navigate_to_source"
    
    def get_reward(self, robot_state, object_state):
        """Tính reward dựa trên phase hiện tại."""
        robot_pos = robot_state[:2]
        object_pos = object_state[:3]
        gripper_closed = robot_state[-1] < 0.5
        
        if self.phase == "navigate_to_source":
            # Reward: Gần source hơn
            dist = np.linalg.norm(robot_pos - self.source_pos[:2])
            reward = -dist
            if dist < 0.3:
                self.phase = "pick"
                reward += 1.0
        
        elif self.phase == "pick":
            # Reward: Gắp được object
            if gripper_closed and object_pos[2] > 0.1:
                self.phase = "navigate_to_dest"
                reward = 5.0
            else:
                dist_to_obj = np.linalg.norm(robot_pos - object_pos[:2])
                reward = -dist_to_obj
        
        elif self.phase == "navigate_to_dest":
            dist = np.linalg.norm(robot_pos - self.dest_pos[:2])
            reward = -dist
            if dist < 0.3:
                self.phase = "place"
                reward += 1.0
        
        elif self.phase == "place":
            dist_to_target = np.linalg.norm(object_pos[:2] - self.dest_pos[:2])
            if dist_to_target < 0.05 and not gripper_closed:
                reward = 10.0  # Task complete!
            else:
                reward = -dist_to_target
        
        return reward
    
    def is_complete(self, object_state):
        """Kiểm tra task hoàn thành."""
        dist = np.linalg.norm(object_state[:2] - self.dest_pos[:2])
        return self.phase == "place" and dist < 0.05


# Ví dụ sử dụng
task = PickAndDeliverTask(
    source_pos=[2.0, 1.0, 0.8],    # Bàn nguồn
    dest_pos=[4.0, 3.0, 0.8],       # Bàn đích
    object_name="coffee_mug",
)

Whole-Body Control

class WholeBodyController:
    """Whole-body control cho mobile manipulation.
    
    Phối hợp base movement + arm movement để đạt end-effector target
    trong khi tránh obstacles và joint limits.
    """
    
    def __init__(self, robot_model):
        self.model = robot_model
        self.base_weight = 0.3     # Ưu tiên arm hơn base
        self.arm_weight = 0.7
        self.obstacle_threshold = 0.5  # 50cm
    
    def compute_action(self, current_state, ee_target, obstacles=None):
        """Tính action cho whole-body để đạt end-effector target.
        
        Args:
            current_state: [base_pos(3), left_arm(7), right_arm(7)]
            ee_target: Target end-effector position [x, y, z, rx, ry, rz]
            obstacles: List of obstacle positions
        
        Returns:
            action: [base_vel(2), left_arm(7), right_arm(7)]
        """
        # Tính end-effector error
        current_ee = self._forward_kinematics(current_state)
        ee_error = ee_target - current_ee
        
        # Jacobian cho arm
        J_arm = self._compute_arm_jacobian(current_state)
        
        # Jacobian cho base (simple — base position ảnh hưởng trực tiếp đến EE)
        J_base = self._compute_base_jacobian(current_state)
        
        # Whole-body Jacobian
        J_wb = np.hstack([J_base, J_arm])
        
        # Pseudo-inverse với weighting
        W = np.diag([self.base_weight] * 2 + [self.arm_weight] * 14)
        J_pinv = np.linalg.pinv(J_wb @ W)
        
        # Compute joint velocities
        action = W @ J_pinv @ ee_error
        
        # Obstacle avoidance
        if obstacles:
            repulsion = self._compute_obstacle_repulsion(
                current_state, obstacles
            )
            action[:2] += repulsion  # Chỉ ảnh hưởng base
        
        # Clip actions
        action[:2] = np.clip(action[:2], -0.5, 0.5)    # Base velocity
        action[2:] = np.clip(action[2:], -1.0, 1.0)     # Arm joints
        
        return action
    
    def _compute_obstacle_repulsion(self, state, obstacles):
        """Tính lực đẩy từ obstacles."""
        robot_pos = state[:2]
        repulsion = np.zeros(2)
        
        for obs_pos in obstacles:
            diff = robot_pos - obs_pos[:2]
            dist = np.linalg.norm(diff)
            
            if dist < self.obstacle_threshold:
                # Repulsion force inversely proportional to distance
                force = (1.0 / dist - 1.0 / self.obstacle_threshold) / dist**2
                repulsion += force * diff / dist
        
        return repulsion * 0.1  # Scale factor

Papers tham khảo

Mobile ALOHA: Learning Bimanual Mobile Manipulation with Low-Cost Whole-Body Teleoperation — Fu et al., 2024 — Gold standard cho mobile manipulation research
HomeRobot: Open-Vocabulary Mobile Manipulation — Yenamandra et al., NeurIPS 2023 — Benchmark cho household mobile manipulation
OK-Robot: What Really Matters in Integrating Open-Knowledge Models for Robotics — Liu et al., 2024 — End-to-end mobile manipulation system

Kết luận và bước tiếp theo

Mobile manipulation mở ra khả năng hoạt động trong không gian thực, không bị giới hạn bởi workspace cố định. Challenges chính:

Action space lớn (16D+) — cần model đủ mạnh
Navigation-manipulation coordination — biết khi nào di chuyển, khi nào thao tác
Safety — tránh va chạm khi di chuyển + thao tác

Bài tiếp theo — Humanoid Manipulation: Whole-body Control — sẽ đưa chúng ta đến đỉnh cao nhất: robot hình người với hàng chục DOF, thách thức lớn nhất trong robotics hiện tại.