Module 4: Complete Vision-Language-Action Implementation Guide
T061-T065: Consolidated Chapters
This section covers all 5 core chapters of Module 4 in one comprehensive guide.
T061: VLA Architecture Fundamentals
What is VLA?
VLA (Vision-Language-Action) = Multimodal AI system that:
- Sees the world (vision)
- Understands language (NLP)
- Plans actions (reasoning)
- Executes on robot (control)
Architecture Pattern
Inputs:
├─ Camera (RGB/depth images)
├─ LiDAR (point clouds)
├─ Microphone (voice audio)
└─ Proprioception (joint angles)
Perception Pipeline:
├─ Image encoding (CNN features)
├─ Audio encoding (Whisper)
├─ Sensor fusion
└─ Scene understanding
Decision Module (LLM):
├─ Parse user intent
├─ Plan action sequence
├─ Generate robot commands
└─ Reason about constraints
Action Execution:
├─ Motion planning
├─ Joint control
├─ Grasp execution
└─ Navigation
End-to-End vs. Modular
End-to-End:
- Single neural network: image → action
- Pros: Simple, fast
- Cons: Black box, hard to debug
Modular (Recommended):
- Pipeline: perception → language → planning → control
- Pros: Interpretable, maintainable, reusable
- Cons: More components to build, integrate, and tune
Code Pattern
import whisper

class VLASystem:
    def __init__(self):
        self.perception = PerceptionPipeline()    # Vision + SLAM (Module 3)
        self.language = LLMAgent()                # Language understanding
        self.planner = ActionPlanner()            # Task planning
        self.controller = RobotController()       # Motion execution
        self.whisper_model = whisper.load_model("base")  # Speech-to-text (see T063)

    def process_command(self, voice_input):
        # 1. Transcribe voice audio to text
        text = self.whisper_model.transcribe(voice_input)["text"]
        # 2. Understand: map text to a structured intent
        intent = self.language.parse(text)
        # 3. Plan: intent + current scene state -> action sequence
        actions = self.planner.plan(intent, self.perception.state)
        # 4. Execute each action on the robot
        for action in actions:
            self.controller.execute(action)
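A minimal usage sketch of this pattern. The helper classes are the placeholders named above, and `record_audio` is the microphone helper defined later in T063; none of this is a fixed API.

# Hypothetical one-shot run: record a voice command, then execute it end to end.
system = VLASystem()
audio = record_audio(duration_seconds=5)   # microphone helper from T063 below
system.process_command(audio)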
T062: Language-to-Action Mapping
LLM Prompt Engineering
Structure your prompts clearly:
System: "You are a robot control system. Convert user commands to JSON actions."
User: "Walk to the kitchen and find the coffee cup"
Response:
{
  "tasks": [
    {"type": "navigate", "location": "kitchen", "reason": "find coffee cup"},
    {"type": "detect", "object": "coffee cup", "confirm_action": true},
    {"type": "grasp", "object": "coffee cup"},
    {"type": "navigate", "location": "user", "reason": "return with cup"}
  ],
  "safety_checks": ["avoid_obstacles", "respect_joint_limits"],
  "timeout_seconds": 120
}
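Before executing an LLM-generated plan, it is worth validating its structure. A minimal sketch, assuming the JSON shape above (key names like `tasks` and `type` are this guide's convention, not a standard; the known task types are the ones shown in the example):

KNOWN_TASK_TYPES = {"navigate", "detect", "grasp"}

def validate_plan(plan):
    """Check that an LLM-generated plan matches the expected shape."""
    if not isinstance(plan, dict) or not isinstance(plan.get("tasks"), list):
        return False, "missing 'tasks' list"
    for i, task in enumerate(plan["tasks"]):
        if not isinstance(task, dict) or "type" not in task:
            return False, f"task {i} is malformed"
        if task["type"] not in KNOWN_TASK_TYPES:
            return False, f"task {i} has unknown type '{task['type']}'"
    return True, "ok"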
Implementation
import json
from openai import OpenAI

class LanguageAgent:
    def __init__(self, api_key):
        self.client = OpenAI(api_key=api_key)
        self.system_prompt = """You are a robot control system.
Convert user voice commands to JSON action plans.
Output only valid JSON, no markdown."""

    def parse_command(self, voice_text, robot_state):
        """Parse a voice command into a structured action plan."""
        user_prompt = f"""
Current robot state:
- Location: {robot_state['position']}
- Detected objects: {robot_state['objects']}
- Battery: {robot_state['battery']}%

User command: "{voice_text}"

Generate action plan JSON.
"""
        response = self.client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.7,  # lower values give more deterministic plans
        )
        plan_text = response.choices[0].message.content
        actions = json.loads(plan_text)  # raises if the model returned invalid JSON
        return actions
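LLMs occasionally return output that is not valid JSON, so the `json.loads` call above can raise. A minimal retry wrapper sketch (the function name and retry count are assumptions, not part of the OpenAI API):

import json

def parse_command_safe(agent, voice_text, robot_state, retries=2):
    """Retry the LLM call when the response is not valid JSON."""
    for attempt in range(retries + 1):
        try:
            return agent.parse_command(voice_text, robot_state)
        except json.JSONDecodeError:
            continue  # ask the model again; lowering temperature also helps
    return None  # caller should fall back to a safe default (do nothing)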
T063: Voice Interface & Whisper
Whisper Speech-to-Text
import whisper

# Load model (tiny = fastest, then base, small, medium, large)
model = whisper.load_model("base")

# Transcribe from file
result = model.transcribe("audio.mp3", language="en")
print(result["text"])  # e.g. "walk to the kitchen"

# Real-time from microphone
import pyaudio
import numpy as np

def record_audio(duration_seconds=5):
    """Record mono 16 kHz float32 audio from the default microphone."""
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1,
                    rate=16000, input=True, frames_per_buffer=1024)
    frames = []
    for _ in range(int(16000 / 1024 * duration_seconds)):
        data = stream.read(1024)
        frames.append(np.frombuffer(data, np.float32))
    stream.stop_stream()
    stream.close()
    p.terminate()
    return np.concatenate(frames)

audio = record_audio(5)
result = model.transcribe(audio, language="en")
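Transcribing every window wastes compute when nobody is speaking. A minimal energy-based gate sketch, operating on the 16 kHz float32 buffer returned by `record_audio` (the threshold value is an assumption and needs tuning per microphone and room):

import numpy as np

SILENCE_RMS_THRESHOLD = 0.01  # assumed value; tune for your hardware

def is_speech(audio):
    """Crude voice-activity check: skip transcription for near-silent audio."""
    rms = float(np.sqrt(np.mean(audio ** 2)))
    return rms > SILENCE_RMS_THRESHOLD

audio = record_audio(2)
if is_speech(audio):
    result = model.transcribe(audio, language="en")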
ROS 2 Voice Node
#!/usr/bin/env python3
import rclpy
from rclpy.node import Node
from std_msgs.msg import String
import whisper

class VoiceCommandNode(Node):
    def __init__(self):
        super().__init__('voice_commander')
        self.model = whisper.load_model("base")
        self.publisher = self.create_publisher(String, '/voice_command', 10)
        self.create_timer(0.1, self.listen_callback)

    def listen_callback(self):
        """Continuously listen for voice commands."""
        audio = record_audio(2)  # 2-second window (record_audio defined above)
        result = self.model.transcribe(audio)
        text = result["text"].strip()
        if text:
            msg = String()
            msg.data = text
            self.publisher.publish(msg)

if __name__ == '__main__':
    rclpy.init()
    node = VoiceCommandNode()
    rclpy.spin(node)
    rclpy.shutdown()
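On the other side of the `/voice_command` topic, a minimal subscriber sketch that receives the transcribed text and hands it off for planning (the node and callback names are illustrative, not fixed by this guide):

#!/usr/bin/env python3
import rclpy
from rclpy.node import Node
from std_msgs.msg import String

class CommandDispatcher(Node):
    """Subscribes to transcribed voice commands and forwards them for planning."""
    def __init__(self):
        super().__init__('command_dispatcher')
        self.create_subscription(String, '/voice_command', self.on_command, 10)

    def on_command(self, msg):
        self.get_logger().info(f'Voice command received: "{msg.data}"')
        # Hand off to LanguageAgent.parse_command(...) here (see T062).

if __name__ == '__main__':
    rclpy.init()
    rclpy.spin(CommandDispatcher())
    rclpy.shutdown()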
T064: Sensor Feedback Loops
Closed-Loop Control
import time

class RobotController:
    def execute_with_feedback(self, action, timeout_seconds=30):
        """Execute an action with vision feedback until complete or timed out."""
        start_time = time.time()
        while time.time() - start_time < timeout_seconds:
            # 1. Get current state from perception
            state = self.perception.update()
            # 2. Check if the action is complete
            if self.action_complete(action, state):
                return True
            # 3. Correct the trajectory if needed
            correction = self.compute_correction(action, state)
            if correction is not None:
                self.send_command(correction)
            time.sleep(0.1)
        return False  # Timeout

    def action_complete(self, action, state):
        """Check whether the action has finished."""
        if action['type'] == 'grasp':
            return state['gripper_closed'] and state['object_in_hand']
        elif action['type'] == 'navigate':
            return self.distance_to_goal(state) < 0.1  # within 10 cm of the goal
        return False

    def compute_correction(self, action, state):
        """Vision-based trajectory correction."""
        if action['type'] == 'navigate':
            current_pos = state['position']
            target_pos = action['goal']
            error = target_pos - current_pos
            # PID control on the position error
            return self.pid_controller.update(error)
        return None
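The `compute_correction` method above assumes a `pid_controller` object. A minimal PID sketch on a 2D position error; the gains and time step are placeholder values, not tuned for any specific robot:

import numpy as np

class PIDController:
    """Simple PID on a 2D position error vector; gains are illustrative only."""
    def __init__(self, kp=1.0, ki=0.0, kd=0.1, dt=0.1):
        self.kp, self.ki, self.kd, self.dt = kp, ki, kd, dt
        self.integral = np.zeros(2)
        self.prev_error = np.zeros(2)

    def update(self, error):
        error = np.asarray(error, dtype=float)
        self.integral += error * self.dt
        derivative = (error - self.prev_error) / self.dt
        self.prev_error = error
        return self.kp * error + self.ki * self.integral + self.kd * derivative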
T065: System Integration & Deployment
Complete End-to-End Pipeline
import whisper
import rclpy

class CapstoneHumanoidController:
    def __init__(self, jetson=True):
        # Initialize all modules
        self.perception = SLAMNav2System()        # Module 3
        self.language = LanguageAgent()           # Module 4: Language
        self.voice = WhisperNode()                # Module 4: Voice
        self.controller = RobotController()       # Execution
        self.whisper_model = whisper.load_model("base")  # Speech-to-text

    def run(self):
        """Main loop."""
        rclpy.init()
        while True:
            # 1. Listen for voice audio
            voice_input = self.voice.listen()
            if voice_input is None:
                continue
            # 2. Transcribe
            text = self.whisper_model.transcribe(voice_input)["text"]
            self.log(f"Heard: {text}")
            # 3. Understand intent
            state = self.perception.get_state()
            actions = self.language.parse_command(text, state)
            self.log(f"Plan: {actions}")
            # 4. Execute with feedback (the plan is {"tasks": [...]}, see T062)
            for action in actions.get("tasks", []):
                success = self.controller.execute_with_feedback(action)
                if not success:
                    self.log(f"Failed: {action}")
                    break
            # 5. Confirm completion
            self.speak(f"Completed command: {text[:50]}...")

    def speak(self, text):
        """Text-to-speech feedback."""
        import pyttsx3
        engine = pyttsx3.init()
        engine.say(text)
        engine.runAndWait()
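A minimal entry-point sketch for running the capstone controller (it assumes the classes above are defined in the same script or importable):

if __name__ == '__main__':
    controller = CapstoneHumanoidController(jetson=True)
    try:
        controller.run()
    except KeyboardInterrupt:
        pass  # Ctrl+C stops the main loop cleanly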
Jetson Optimization
# Quantize Whisper for faster inference
import torch
import whisper
from torch.quantization import quantize_dynamic

model = whisper.load_model("base")
quantized = quantize_dynamic(
    model,
    {torch.nn.Linear},    # quantize the Linear layers only
    dtype=torch.qint8
)
# Dynamic INT8 quantization can substantially speed up inference with minimal
# accuracy loss; note it applies to CPU execution, not CUDA.

# Use a smaller local LLM on Jetson via Ollama
from ollama import Client

client = Client(host='http://localhost:11434')
response = client.generate(model='llama2', prompt="Walk forward")
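The language-to-action prompt from T062 can be pointed at the local model instead of a cloud API. A minimal sketch, assuming an Ollama server on localhost and a model that follows JSON instructions reasonably well (the prompt wording is an assumption):

import json
from ollama import Client

client = Client(host='http://localhost:11434')
SYSTEM = "You are a robot control system. Convert user commands to JSON actions. Output only valid JSON."

def local_plan(command_text):
    """Generate an action plan on-device instead of calling a cloud API."""
    response = client.generate(model='llama2',
                               prompt=f"{SYSTEM}\n\nUser command: {command_text}")
    try:
        return json.loads(response['response'])
    except json.JSONDecodeError:
        return None  # small local models fail this more often; retry or fall back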
Performance Targets
| Component | Typical latency | Target |
|---|---|---|
| Whisper (speech-to-text) | 500-2000 ms | Under 2 s |
| LLM planning | 500-5000 ms | Under 5 s |
| Perception | 30-100 ms | Under 100 ms |
| Control | 10-50 ms | Under 50 ms |
| Total pipeline | 1-10 s | Under 10 s feels responsive |
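To check these budgets on real hardware, time each stage of the pipeline. A minimal sketch using a timing context manager (the stage names mirror the table; the wrapped calls are placeholders for the pipeline steps above):

import time
from contextlib import contextmanager

@contextmanager
def stage_timer(name, budget_ms):
    """Print how long a pipeline stage took and whether it met its budget."""
    start = time.perf_counter()
    yield
    elapsed_ms = (time.perf_counter() - start) * 1000
    status = "OK" if elapsed_ms <= budget_ms else "OVER BUDGET"
    print(f"{name}: {elapsed_ms:.0f} ms ({status})")

# Example usage around the transcription stage:
# with stage_timer("Whisper", 2000):
#     result = model.transcribe(audio)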
Safety Constraints
class SafetyMonitor:
    def __init__(self, perception):
        self.perception = perception            # perception pipeline used for obstacle checks
        self.max_velocity = 0.5                 # m/s
        self.max_joint_effort = 100             # Nm
        self.collision_threshold = 0.2          # m

    def validate_action(self, action):
        """Check safety constraints before execution; returns (ok, reason)."""
        if action['type'] == 'navigate':
            # Check for obstacles within the collision threshold
            obstacles = self.perception.detect_obstacles()
            if any(obs['distance'] < self.collision_threshold for obs in obstacles):
                return False, "Obstacles detected"
        return True, "Safe"
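A minimal sketch of how the monitor slots into the execution loop; the `perception`, `language`, and `controller` instances are assumed to be the objects built earlier in this module, and the `tasks` key comes from the plan format in T062:

monitor = SafetyMonitor(perception)            # reuse the perception pipeline instance

plan = language.parse_command(text, state)     # {"tasks": [...]} as in T062
for action in plan.get("tasks", []):
    safe, reason = monitor.validate_action(action)
    if not safe:
        print(f"Skipping {action['type']}: {reason}")
        continue
    controller.execute_with_feedback(action)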
Summary
Complete VLA system:
- Whisper transcribes voice
- LLM understands intent
- Perception provides context
- Planner generates actions
- Controller executes with feedback
Integrated on Jetson for edge deployment.
Navigation
- Module 3: SLAM & Perception
- Labs: Lab 4.1: Language-to-Action