Spaces:
Sleeping
Sleeping
Melika Kheirieh committed on
Commit ·
2e3e9b8
1
Parent(s): d367a93
feat(metrics): instrument per-stage and pipeline_total latency; count pipeline_runs_total (ok/error/ambiguous)
Browse files
- Makefile +21 -0
- docker-compose.prom.yml +13 -0
- prometheus/prometheus.yml +14 -0
- prometheus/rules.yml +49 -0
- scripts/smoke_metrics.sh +31 -0
Makefile
CHANGED
|
@@ -108,3 +108,24 @@ clean: ## Remove Python caches
|
|
| 108 |
.PHONY: clean-all
|
| 109 |
clean-all: clean ## Remove build artifacts and coverage
|
| 110 |
rm -rf dist build .coverage *.egg-info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
.PHONY: clean-all
|
| 109 |
clean-all: clean ## Remove build artifacts and coverage
|
| 110 |
rm -rf dist build .coverage *.egg-info
|
| 111 |
+
|
| 112 |
+
# ---------- Metrics ----------
|
| 113 |
+
.PHONY: prom-up prom-check smoke
|
| 114 |
+
|
| 115 |
+
prom-up:
|
| 116 |
+
docker compose -f docker-compose.prom.yml up -d
|
| 117 |
+
|
| 118 |
+
prom-check:
|
| 119 |
+
@if command -v promtool >/dev/null 2>&1; then \
|
| 120 |
+
echo "🔍 Running promtool locally..."; \
|
| 121 |
+
promtool check rules prometheus/rules.yml && promtool check config prometheus/prometheus.yml; \
|
| 122 |
+
else \
|
| 123 |
+
echo "⚠️ promtool not found, running via Docker..."; \
|
| 124 |
+
docker run --rm --entrypoint promtool -v "$$(pwd)/prometheus:/etc/prometheus" prom/prometheus:v2.55.0 \
|
| 125 |
+
check rules /etc/prometheus/rules.yml && \
|
| 126 |
+
docker run --rm --entrypoint promtool -v "$$(pwd)/prometheus:/etc/prometheus" prom/prometheus:v2.55.0 \
|
| 127 |
+
check config /etc/prometheus/prometheus.yml; \
|
| 128 |
+
fi
|
| 129 |
+
|
| 130 |
+
smoke:
|
| 131 |
+
./scripts/smoke_metrics.sh
|
docker-compose.prom.yml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.8"
|
| 2 |
+
services:
|
| 3 |
+
prometheus:
|
| 4 |
+
image: prom/prometheus:v2.55.0
|
| 5 |
+
container_name: nl2sql-prom
|
| 6 |
+
command:
|
| 7 |
+
- --config.file=/etc/prometheus/prometheus.yml
|
| 8 |
+
volumes:
|
| 9 |
+
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
| 10 |
+
- ./prometheus/rules.yml:/etc/prometheus/rules.yml:ro
|
| 11 |
+
ports:
|
| 12 |
+
- "9090:9090"
|
| 13 |
+
restart: unless-stopped
|
prometheus/prometheus.yml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global:
|
| 2 |
+
scrape_interval: 15s
|
| 3 |
+
evaluation_interval: 15s
|
| 4 |
+
|
| 5 |
+
# Load recording and alerting rules from this file
|
| 6 |
+
rule_files:
|
| 7 |
+
- rules.yml # relative to this directory (prometheus/)
|
| 8 |
+
|
| 9 |
+
scrape_configs:
|
| 10 |
+
- job_name: "nl2sql-app"
|
| 11 |
+
static_configs:
|
| 12 |
+
- targets: ["host.docker.internal:8000"] # reaches the app on the Docker host when Prometheus runs inside Docker; NOTE: on Linux Docker Engine this name resolves only if the container maps it (extra_hosts: "host.docker.internal:host-gateway")
|
| 13 |
+
labels:
|
| 14 |
+
service: "nl2sql"
|
prometheus/rules.yml
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
groups:
|
| 2 |
+
- name: nl2sql_latency
|
| 3 |
+
rules:
|
| 4 |
+
# p95 latency per stage (5-minute window)
|
| 5 |
+
- record: nl2sql:stage_p95_ms
|
| 6 |
+
expr: |
|
| 7 |
+
histogram_quantile(
|
| 8 |
+
0.95,
|
| 9 |
+
sum(rate(stage_duration_ms_bucket[5m])) by (le, stage)
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# pipeline success ratio (5-minute rolling window)
|
| 13 |
+
- record: nl2sql:pipeline_success_ratio
|
| 14 |
+
expr: |
|
| 15 |
+
sum(rate(pipeline_runs_total{status="ok"}[5m]))
|
| 16 |
+
/
|
| 17 |
+
sum(rate(pipeline_runs_total[5m]))
|
| 18 |
+
|
| 19 |
+
- name: nl2sql_alerts
|
| 20 |
+
rules:
|
| 21 |
+
# Alert: success ratio below 90% for 10 minutes
|
| 22 |
+
- alert: PipelineLowSuccessRatio
|
| 23 |
+
expr: nl2sql:pipeline_success_ratio < 0.9
|
| 24 |
+
for: 10m
|
| 25 |
+
labels:
|
| 26 |
+
severity: warning
|
| 27 |
+
annotations:
|
| 28 |
+
summary: "Pipeline success ratio dropped"
|
| 29 |
+
description: "Success ratio < 90% over the past 10 minutes"
|
| 30 |
+
|
| 31 |
+
# Alert: high generator p95 latency (>1.5s for 5 minutes)
|
| 32 |
+
- alert: GeneratorLatencyHigh
|
| 33 |
+
expr: nl2sql:stage_p95_ms{stage="generator"} > 1500
|
| 34 |
+
for: 5m
|
| 35 |
+
labels:
|
| 36 |
+
severity: warning
|
| 37 |
+
annotations:
|
| 38 |
+
summary: "Generator p95 latency high"
|
| 39 |
+
description: "Generator p95 > 1.5s for 5 minutes"
|
| 40 |
+
|
| 41 |
+
# Alert: unusual spike in Safety blocks
|
| 42 |
+
- alert: SafetyBlocksSpike
|
| 43 |
+
expr: rate(safety_blocks_total[5m]) > 0.5
|
| 44 |
+
for: 5m
|
| 45 |
+
labels:
|
| 46 |
+
severity: info
|
| 47 |
+
annotations:
|
| 48 |
+
summary: "Unusual Safety block rate"
|
| 49 |
+
description: "Safety blocks rate > 0.5 per second, averaged over 5 minutes (check inputs or safety rules)"
|
scripts/smoke_metrics.sh
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -euo pipefail
|
| 3 |
+
|
| 4 |
+
BASE=${BASE:-http://localhost:8000}
|
| 5 |
+
API="$BASE/api/v1"
|
| 6 |
+
|
| 7 |
+
# Send a few successful queries to populate basic metrics
|
| 8 |
+
for q in \
|
| 9 |
+
"List all artists" \
|
| 10 |
+
"Top 5 albums by sales" \
|
| 11 |
+
"Count customers"
|
| 12 |
+
do
|
| 13 |
+
curl -s -X POST "$API/nl2sql" \
|
| 14 |
+
-H 'Content-Type: application/json' \
|
| 15 |
+
-d "{\"query\":\"$q\"}" >/dev/null || true
|
| 16 |
+
done
|
| 17 |
+
|
| 18 |
+
# Send queries that trigger safety and verifier checks
|
| 19 |
+
curl -s -X POST "$API/nl2sql" \
|
| 20 |
+
-H 'Content-Type: application/json' \
|
| 21 |
+
-d '{"query":"DELETE FROM users;"}' >/dev/null || true
|
| 22 |
+
|
| 23 |
+
curl -s -X POST "$API/nl2sql" \
|
| 24 |
+
-H 'Content-Type: application/json' \
|
| 25 |
+
-d '{"query":"SELECT COUNT(*), country FROM customers;"}' >/dev/null || true
|
| 26 |
+
|
| 27 |
+
# Print a snapshot of key Prometheus metrics
|
| 28 |
+
echo -e "\n--- Metrics snapshot ---"
|
| 29 |
+
curl -s "$BASE/metrics" | grep -E \
|
| 30 |
+
'stage_duration_ms_(sum|count|bucket)|pipeline_runs_total|safety_(checks|blocks)_total|verifier_(checks|failures)_total' \
|
| 31 |
+
|| true
|