Melika Kheirieh committed on
Commit
2e3e9b8
·
1 Parent(s): d367a93

feat(metrics): instrument per-stage and pipeline_total latency; count pipeline_runs_total (ok/error/ambiguous)

Browse files
Makefile CHANGED
@@ -108,3 +108,24 @@ clean: ## Remove Python caches
108
  .PHONY: clean-all
109
  clean-all: clean ## Remove build artifacts and coverage
110
  rm -rf dist build .coverage *.egg-info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  .PHONY: clean-all
109
  clean-all: clean ## Remove build artifacts and coverage
110
  rm -rf dist build .coverage *.egg-info
111
+
112
+ # ---------- Metrics ----------
113
+ .PHONY: prom-up prom-check smoke
114
+
115
+ prom-up: ## Start Prometheus in the background via docker-compose.prom.yml
116
+ docker compose -f docker-compose.prom.yml up -d
117
+
118
+ prom-check: ## Validate Prometheus rules and config (local promtool, else promtool from the Docker image)
119
+ @if command -v promtool >/dev/null 2>&1; then \
120
+ echo "🔍 Running promtool locally..."; \
121
+ promtool check rules prometheus/rules.yml && promtool check config prometheus/prometheus.yml; \
122
+ else \
123
+ echo "⚠️ promtool not found, running via Docker..."; \
124
+ docker run --rm --entrypoint promtool -v "$$(pwd)/prometheus:/etc/prometheus:ro" prom/prometheus:v2.55.0 \
125
+ check rules /etc/prometheus/rules.yml && \
126
+ docker run --rm --entrypoint promtool -v "$$(pwd)/prometheus:/etc/prometheus:ro" prom/prometheus:v2.55.0 \
127
+ check config /etc/prometheus/prometheus.yml; \
128
+ fi
129
+
130
+ smoke: ## Run scripts/smoke_metrics.sh (sends sample requests, then prints a metrics snapshot)
131
+ ./scripts/smoke_metrics.sh
docker-compose.prom.yml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "3.8"  # NOTE(review): recent Docker Compose releases treat the top-level "version" key as obsolete — confirm it is still wanted
2
+ services:  # a single service: the Prometheus server
3
+ prometheus:
4
+ image: prom/prometheus:v2.55.0  # pinned release tag
5
+ container_name: nl2sql-prom
6
+ command:  # point Prometheus at the config file mounted under volumes below
7
+ - --config.file=/etc/prometheus/prometheus.yml
8
+ volumes:  # config and rules are bind-mounted read-only (:ro)
9
+ - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
10
+ - ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
11
+ ports:  # publish container port 9090 on host port 9090
12
+ - "9090:9090"
13
+ restart: unless-stopped
prometheus/prometheus.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s  # how often targets are scraped
3
+ evaluation_interval: 15s  # how often recording/alerting rules are evaluated
4
+
5
+ # Load recording and alerting rules from rules.yml
6
+ rule_files:
7
+ - rules.yml # resolved relative to this config file's directory (prometheus/)
8
+
9
+ scrape_configs:
10
+ - job_name: "nl2sql-app"
11
+ static_configs:
12
+ - targets: ["host.docker.internal:8000"] # reaches the app on the Docker host when Prometheus runs inside Docker; NOTE(review): on Linux Docker Engine this name may need an extra_hosts "host.docker.internal:host-gateway" entry on the container — confirm
13
+ labels:  # static label attached to everything scraped from this target
14
+ service: "nl2sql"
prometheus/rules.yml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ groups:
2
+ - name: nl2sql_latency  # recording rules: per-stage p95 latency and pipeline success ratio
3
+ rules:
4
+ # Recording rule: 95th-percentile stage latency, one series per "stage" label, computed from stage_duration_ms histogram buckets over a 5-minute rate window
5
+ - record: nl2sql:stage_p95_ms
6
+ expr: |
7
+ histogram_quantile(
8
+ 0.95,
9
+ sum(rate(stage_duration_ms_bucket[5m])) by (le, stage)
10
+ )
11
+
12
+ # Recording rule: pipeline success ratio = rate of runs with status="ok" divided by rate of all runs (both over a 5-minute rolling window)
13
+ - record: nl2sql:pipeline_success_ratio
14
+ expr: |
15
+ sum(rate(pipeline_runs_total{status="ok"}[5m]))
16
+ /
17
+ sum(rate(pipeline_runs_total[5m]))
18
+
19
+ - name: nl2sql_alerts  # alerting rules built on the nl2sql:* recording rules
20
+ rules:
21
+ # Alert: recorded pipeline success ratio stays below 90% for 10 minutes
22
+ - alert: PipelineLowSuccessRatio
23
+ expr: nl2sql:pipeline_success_ratio < 0.9
24
+ for: 10m  # condition must hold for 10 minutes before the alert fires
25
+ labels:
26
+ severity: warning
27
+ annotations:
28
+ summary: "Pipeline success ratio dropped"
29
+ description: "Success ratio < 90% over the past 10 minutes"
30
+
31
+ # Alert: generator-stage p95 latency above 1500 ms (1.5 s) for 5 minutes
32
+ - alert: GeneratorLatencyHigh
33
+ expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
34
+ for: 5m  # condition must hold for 5 minutes before the alert fires
35
+ labels:
36
+ severity: warning
37
+ annotations:
38
+ summary: "Generator p95 latency high"
39
+ description: "Generator p95 > 1.5s for 5 minutes"
40
+
41
+ # Alert: unusual spike in Safety blocks (rate() is per-second: fires above 0.5 blocks/s, averaged over 5m, sustained for 5 minutes)
42
+ - alert: SafetyBlocksSpike
43
+ expr: rate(safety_blocks_total[5m]) > 0.5
44
+ for: 5m
45
+ labels:
46
+ severity: info
47
+ annotations:
48
+ summary: "Unusual Safety block rate"
49
+ description: "Safety blocks rate > 0.5 per second (check inputs or safety rules)"
scripts/smoke_metrics.sh ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail  # abort on errors, unset variables, and failed pipeline stages
3
+
4
+ BASE=${BASE:-http://localhost:8000}  # app base URL; override through the BASE environment variable
5
+ API="$BASE/api/v1"  # prefix for the POST requests below
6
+
7
+ # Send a few queries intended to succeed, to populate basic metrics; responses are discarded and curl failures are ignored (|| true)
8
+ for q in \
9
+ "List all artists" \
10
+ "Top 5 albums by sales" \
11
+ "Count customers"
12
+ do
13
+ curl -s -X POST "$API/nl2sql" \
14
+ -H 'Content-Type: application/json' \
15
+ -d "{\"query\":\"$q\"}" >/dev/null || true
16
+ done
17
+
18
+ # Send queries meant to trigger safety and verifier checks (a DELETE statement, then COUNT(*) next to a plain column — presumably what the verifier rejects; confirm)
19
+ curl -s -X POST "$API/nl2sql" \
20
+ -H 'Content-Type: application/json' \
21
+ -d '{"query":"DELETE FROM users;"}' >/dev/null || true
22
+
23
+ curl -s -X POST "$API/nl2sql" \
24
+ -H 'Content-Type: application/json' \
25
+ -d '{"query":"SELECT COUNT(*), country FROM customers;"}' >/dev/null || true
26
+
27
+ # Print a snapshot of key Prometheus metrics from $BASE/metrics; an empty grep result or failed fetch is tolerated (|| true)
28
+ echo -e "\n--- Metrics snapshot ---"
29
+ curl -s "$BASE/metrics" | grep -E \
30
+ 'stage_duration_ms_(sum|count|bucket)|pipeline_runs_total|safety_(checks|blocks)_total|verifier_(checks|failures)_total' \
31
+ || true