Monitoring System Usage Guide

Overview

This guide provides practical examples for using the Radium monitoring system to track agent execution, analyze costs, and monitor performance.

Basic Agent Tracking

Registering an Agent

use radium_core::monitoring::{MonitoringService, AgentRecord, AgentStatus};

// Open monitoring service (uses workspace database)
let workspace = Workspace::discover()?;
let monitoring_path = workspace.radium_dir().join("monitoring.db");
let monitoring = MonitoringService::open(monitoring_path)?;

// Create and register an agent
let agent = AgentRecord::new("agent-123".to_string(), "developer".to_string())
    .with_plan("REQ-49".to_string())
    .with_process_id(12345)
    .with_log_file("/path/to/logs/agent-123.log".to_string());

monitoring.register_agent(&agent)?;

Updating Agent Status

// Mark agent as running
monitoring.update_status("agent-123", AgentStatus::Running)?;

// Complete agent successfully
monitoring.complete_agent("agent-123", 0)?;

// Mark agent as failed
monitoring.fail_agent("agent-123", "Connection timeout")?;

Using Hooks for Async Tracking

use std::sync::Arc;
use radium_core::hooks::registry::HookRegistry;

// Create monitoring service with hooks
let hook_registry = Arc::new(HookRegistry::new());
let monitoring = MonitoringService::open_with_hooks(monitoring_path, hook_registry)?;

// Register agent with hooks (async)
monitoring.register_agent_with_hooks(&agent).await?;

// Complete agent with hooks (async)
monitoring.complete_agent_with_hooks("agent-123", 0).await?;

Telemetry Collection

Recording Telemetry

use radium_core::monitoring::{TelemetryRecord, TelemetryTracking};

// Create telemetry record
let mut telemetry = TelemetryRecord::new("agent-123".to_string())
    .with_tokens(1500, 800)
    .with_cache_stats(200, 50, 150)
    .with_model("gpt-4".to_string(), "openai".to_string())
    .with_engine_id("openai".to_string());

// Calculate cost
telemetry.calculate_cost();

// Record telemetry (async, executes hooks)
monitoring.record_telemetry(&telemetry).await?;

Parsing Telemetry from API Responses

use radium_core::monitoring::TelemetryParser;

// Parse OpenAI response
let openai_response = r#"
{
  "usage": {
    "prompt_tokens": 1500,
    "completion_tokens": 800,
    "total_tokens": 2300
  }
}
"#;

let (input_tokens, output_tokens) = TelemetryParser::parse_openai(openai_response)?;

// Parse Anthropic response
let anthropic_response = r#"
{
  "usage": {
    "input_tokens": 1500,
    "output_tokens": 800
  }
}
"#;

let (input_tokens, output_tokens) = TelemetryParser::parse_anthropic(anthropic_response)?;

Tracking Tool Execution

// Record telemetry with tool information
let mut telemetry = TelemetryRecord::new("agent-123".to_string())
    .with_tokens(100, 50)
    .with_tool_approval(
        "read_file".to_string(),
        Some(vec!["path/to/file.rs".to_string()]),
        true,
        "auto".to_string(),
    )
    .with_engine_id("claude".to_string());

telemetry.calculate_cost();
monitoring.record_telemetry(&telemetry).await?;

Querying Agent Data

Get Agent Status

// Get single agent
let agent = monitoring.get_agent("agent-123")?;
println!("Agent: {} - Status: {:?}", agent.id, agent.status);

// Get all agents for a plan
let plan_agents = monitoring.get_plan_agents("REQ-49")?;
for agent in plan_agents {
    println!("Agent: {} - Type: {}", agent.id, agent.agent_type);
}

// Get child agents
let children = monitoring.get_children("parent-agent-123")?;
println!("Found {} child agents", children.len());

// List all agents
let all_agents = monitoring.list_agents()?;
for agent in all_agents {
    println!("{}: {:?}", agent.id, agent.status);
}

Get Telemetry Data

// Get all telemetry for an agent
let telemetry = monitoring.get_agent_telemetry("agent-123")?;
for record in telemetry {
    println!(
        "Tokens: {} in, {} out, Cost: ${:.4}",
        record.input_tokens,
        record.output_tokens,
        record.estimated_cost
    );
}

// Get total tokens
let (input, output, cached) = monitoring.get_total_tokens("agent-123")?;
println!("Total: {} in, {} out, {} cached", input, output, cached);

// Get total cost
let total_cost = monitoring.get_total_cost("agent-123")?;
println!("Total cost: ${:.4}", total_cost);

CLI Usage

View Agent Status

# Show status for specific agent
rad monitor status agent-123

# Show status for all agents
rad monitor status

# JSON output
rad monitor status agent-123 --json

Example Output:

Agent: agent-123
Type: developer
Status: Running
Plan: REQ-49
Process ID: 12345
Duration: 45s

List Agents

# List all agents
rad monitor list

# Filter by status
rad monitor list --status running

# JSON output
rad monitor list --json

Example Output:

ID                            Type           Status             Plan        Duration
agent-123                     developer      Running            REQ-49      45s
agent-124                     architect      Completed          REQ-49      120s

View Telemetry

# Telemetry for specific agent
rad monitor telemetry agent-123

# Summary for all agents
rad monitor telemetry

# JSON output
rad monitor telemetry agent-123 --json

Example Output:

Telemetry for agent: agent-123
Total Cost: $0.0450

Timestamp            Input Tokens    Output Tokens   Total Tokens    Cost            Model
2025-12-07 10:30:00  1500            800             2300            $0.0450         gpt-4
2025-12-07 10:31:00  2000            1200            3200            $0.0720         gpt-4

Filtering and Querying Strategies

Filter by Status

let all_agents = monitoring.list_agents()?;
let running_agents: Vec<_> = all_agents
    .iter()
    .filter(|a| a.status == AgentStatus::Running)
    .collect();

Filter by Plan

let plan_agents = monitoring.get_plan_agents("REQ-49")?;

Filter by Time Range

use std::time::{SystemTime, UNIX_EPOCH};

let now = SystemTime::now()
    .duration_since(UNIX_EPOCH)
    .unwrap()
    .as_secs();
let one_hour_ago = now - 3600;

let recent_agents: Vec<_> = monitoring
    .list_agents()?
    .into_iter()
    .filter(|a| a.start_time >= one_hour_ago)
    .collect();

Aggregate Telemetry

// Get telemetry for multiple agents
let agents = monitoring.get_plan_agents("REQ-49")?;
let mut total_cost = 0.0;
let mut total_tokens = 0u64;

for agent in &agents {
    let cost = monitoring.get_total_cost(&agent.id)?;
    total_cost += cost;
    
    let telemetry = monitoring.get_agent_telemetry(&agent.id)?;
    for t in telemetry {
        total_tokens += t.total_tokens;
    }
}

println!("Plan REQ-49: ${:.4} total cost, {} total tokens", total_cost, total_tokens);

Log Management

Using LogManager

use radium_core::monitoring::LogManager;

// Create log manager
let logs_dir = workspace.radium_dir().join("logs");
let log_manager = LogManager::new(logs_dir)?;

// Create log file for agent
let mut log_file = log_manager.create_log("agent-123")?;
writeln!(log_file, "Agent started")?;

// Append to log
log_manager.append_log("agent-123", "Processing task 1")?;
log_manager.append_log("agent-123", "Processing task 2")?;

// Read log
let log_content = log_manager.read_log("agent-123")?;

// Tail log (last 10 lines)
let tail = log_manager.tail_log("agent-123", 10)?;

// List all logs
let all_logs = log_manager.list_logs()?;

Integration with Workflows

Automatic Agent Tracking

When using the workflow executor, agents are automatically tracked:

use radium_core::workflow::WorkflowExecutor;

// Workflow executor automatically:
// 1. Registers agents when steps start
// 2. Updates status as steps progress
// 3. Records telemetry when available
// 4. Completes agents when steps finish

Manual Integration

// In your agent execution code
let agent = AgentRecord::new(agent_id.clone(), agent_type)
    .with_plan(plan_id)
    .with_process_id(std::process::id());

monitoring.register_agent(&agent)?;
monitoring.update_status(&agent_id, AgentStatus::Running)?;

// ... execute agent work ...

// Record telemetry
let mut telemetry = TelemetryRecord::new(agent_id.clone())
    .with_tokens(input_tokens, output_tokens)
    .with_model(model, provider);
telemetry.calculate_cost();
monitoring.record_telemetry(&telemetry).await?;

// Complete agent
monitoring.complete_agent(&agent_id, exit_code)?;

Error Handling

Handling Monitoring Errors

use radium_core::monitoring::{MonitoringError, Result};

match monitoring.get_agent("agent-123") {
    Ok(agent) => println!("Found agent: {}", agent.id),
    Err(MonitoringError::AgentNotFound(id)) => {
        println!("Agent {} not found", id);
    }
    Err(MonitoringError::Database(e)) => {
        eprintln!("Database error: {}", e);
    }
    Err(e) => {
        eprintln!("Other error: {}", e);
    }
}

Graceful Degradation

// Monitoring failures shouldn't break agent execution
if let Err(e) = monitoring.register_agent(&agent) {
    tracing::warn!("Failed to register agent: {}", e);
    // Continue execution without monitoring
}

Best Practices

1. Always Register Agents

let agent = AgentRecord::new(agent_id, agent_type);
monitoring.register_agent(&agent)?;

2. Update Status Regularly

Keep status up to date for accurate monitoring:

monitoring.update_status(&agent_id, AgentStatus::Running)?;
// ... do work ...
monitoring.update_status(&agent_id, AgentStatus::Completed)?;

3. Record Telemetry After Model Calls

Record telemetry immediately after model API calls:

let (input_tokens, output_tokens) = parse_model_response(&response)?;
let mut telemetry = TelemetryRecord::new(agent_id)
    .with_tokens(input_tokens, output_tokens)
    .with_model(model, provider);
telemetry.calculate_cost();
monitoring.record_telemetry(&telemetry).await?;

4. Use Hooks for Custom Logic

// Hooks can modify telemetry, add custom fields, etc.
// See hooks documentation for details

5. Filter Queries Efficiently

Use specific queries instead of filtering in memory:

// Good: Use database query
let plan_agents = monitoring.get_plan_agents("REQ-49")?;

// Less efficient: Filter in memory
let all_agents = monitoring.list_agents()?;
let plan_agents: Vec<_> = all_agents
    .iter()
    .filter(|a| a.plan_id.as_deref() == Some("REQ-49"))
    .collect();

Common Patterns

Pattern 1: Track Agent with Parent

// Parent agent
let parent = AgentRecord::new("parent-123".to_string(), "orchestrator".to_string());
monitoring.register_agent(&parent)?;

// Child agent
let child = AgentRecord::new("child-456".to_string(), "developer".to_string())
    .with_parent("parent-123".to_string());
monitoring.register_agent(&child)?;

Pattern 2: Track Plan Execution

// All agents for a plan
let agents = monitoring.get_plan_agents("REQ-49")?;

// Calculate plan metrics
let mut total_cost = 0.0;
let mut total_duration = 0u64;

for agent in &agents {
    total_cost += monitoring.get_total_cost(&agent.id).unwrap_or(0.0);
    if let Some(end_time) = agent.end_time {
        total_duration += end_time - agent.start_time;
    }
}

Pattern 3: Monitor Tool Usage

// Get telemetry with tool information
let telemetry = monitoring.get_agent_telemetry("agent-123")?;

let tool_usage: std::collections::HashMap<String, u64> = telemetry
    .iter()
    .filter_map(|t| t.tool_name.as_ref().map(|name| (name.clone(), 1)))
    .fold(std::collections::HashMap::new(), |mut acc, (name, count)| {
        *acc.entry(name).or_insert(0) += count;
        acc
    });

for (tool, count) in tool_usage {
    println!("{}: {} executions", tool, count);
}

Troubleshooting

Database Not Found

If you get "Failed to open monitoring database", ensure you're in a Radium workspace:

rad init  # Create workspace if needed

Missing Telemetry

If telemetry is missing, check:

- Are you calling record_telemetry() after model calls?
- Are hooks executing successfully? (check logs)
- Is the agent_id correct?

High Memory Usage

For large telemetry datasets:

- Use summary queries instead of loading all records
- Filter by time range
- Consider archiving old telemetry

Overview

Basic Agent Tracking

Registering an Agent

Updating Agent Status

Using Hooks for Async Tracking

Telemetry Collection

Recording Telemetry

Parsing Telemetry from API Responses

Tracking Tool Execution

Querying Agent Data

Get Agent Status

Get Telemetry Data

CLI Usage

View Agent Status

List Agents

View Telemetry

Filtering and Querying Strategies

Filter by Status

Filter by Plan

Filter by Time Range

Aggregate Telemetry

Log Management

Using LogManager

Integration with Workflows

Automatic Agent Tracking

Manual Integration

Error Handling

Handling Monitoring Errors

Graceful Degradation

Best Practices

1. Always Register Agents

2. Update Status Regularly

3. Record Telemetry After Model Calls

4. Use Hooks for Custom Logic

5. Filter Queries Efficiently

Common Patterns

Pattern 1: Track Agent with Parent

Pattern 2: Track Plan Execution

Pattern 3: Monitor Tool Usage

Troubleshooting

Database Not Found

Missing Telemetry

High Memory Usage

Overview

Basic Agent Tracking

Registering an Agent

Updating Agent Status

Using Hooks for Async Tracking

Telemetry Collection

Recording Telemetry

Parsing Telemetry from API Responses

Tracking Tool Execution

Querying Agent Data

Get Agent Status

Get Telemetry Data

CLI Usage

View Agent Status

List Agents

View Telemetry

Filtering and Querying Strategies

Filter by Status

Filter by Plan

Filter by Time Range

Aggregate Telemetry

Log Management

Using LogManager

Integration with Workflows

Automatic Agent Tracking

Manual Integration

Error Handling

Handling Monitoring Errors

Graceful Degradation

Best Practices

1. Always Register Agents

2. Update Status Regularly

3. Record Telemetry After Model Calls

4. Use Hooks for Custom Logic

5. Filter Queries Efficiently

Common Patterns

Pattern 1: Track Agent with Parent

Pattern 2: Track Plan Execution

Pattern 3: Monitor Tool Usage

Troubleshooting

Database Not Found

Missing Telemetry

High Memory Usage