diff --git a/README.md b/README.md index 2c86517..a79ce4b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # ctlog-uptime-exporter +![grafana dashboard](grafana.png) + A Prometheus exporter for [Certificate Transparency](https://certificate.transparency.dev/) log uptime data published by Google at: ``` diff --git a/dashboard.json b/dashboard.json new file mode 100644 index 0000000..4e1c628 --- /dev/null +++ b/dashboard.json @@ -0,0 +1,549 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + } + ], + "title": "CT Log Uptime", + "uid": "ctlog-uptime", + "description": "Certificate Transparency log 24h uptime sourced from gstatic.com via ctlog-uptime-exporter", + "tags": ["certificate-transparency", "ct", "uptime"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "5m", + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": {}, + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "label": "Datasource", + "hide": 0, + "current": {}, + "options": [] + }, + { + "name": "log_url", + "type": "query", + "label": "Log URL", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(ct_log_uptime_ratio, log_url)", + "query": { + "query": "label_values(ct_log_uptime_ratio, log_url)", + "refId": "StandardVariableQuery" + }, + "multi": true, + "includeAll": true, + "allValue": ".+", + "current": {}, + "refresh": 2, + "sort": 1, + "hide": 0 
}, + { + "name": "endpoint", + "type": "query", + "label": "Endpoint", + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(ct_log_uptime_ratio{log_url=~\"$log_url\"}, endpoint)", + "query": { + "query": "label_values(ct_log_uptime_ratio{log_url=~\"$log_url\"}, endpoint)", + "refId": "StandardVariableQuery" + }, + "multi": true, + "includeAll": true, + "allValue": ".+", + "current": {}, + "refresh": 2, + "sort": 1, + "hide": 0 + }, + { + "name": "topN", + "type": "custom", + "label": "Top N", + "query": "5,10,25,50", + "current": { + "text": "10", + "value": "10", + "selected": true + }, + "options": [ + {"selected": false, "text": "5", "value": "5"}, + {"selected": true, "text": "10", "value": "10"}, + {"selected": false, "text": "25", "value": "25"}, + {"selected": false, "text": "50", "value": "50"} + ], + "hide": 0 + } + ] + }, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "CT Logs", + "description": "Number of unique CT log URLs currently tracked", + "gridPos": {"x": 0, "y": 0, "w": 4, "h": 4}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "none", + "graphMode": "none", + "textMode": "value", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "noValue": "0" + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "count(count by (log_url) (ct_log_uptime_ratio))", + "instant": true, + "refId": "A" + } + ] + }, + { + "id": 2, + "type": "stat", + "title": "Endpoint Types", + "description": "Number of distinct endpoint operation types tracked", + "gridPos": {"x": 4, "y": 0, "w": 4, "h": 4}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "none", + "graphMode": "none", + "textMode": "value", + "justifyMode": "center" + }, 
+ "fieldConfig": { + "defaults": { + "unit": "short", + "noValue": "0" + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "count(count by (endpoint) (ct_log_uptime_ratio))", + "instant": true, + "refId": "A" + } + ] + }, + { + "id": 3, + "type": "stat", + "title": "Average Uptime", + "description": "Mean 24h uptime ratio across all tracked log/endpoint pairs", + "gridPos": {"x": 8, "y": 0, "w": 4, "h": 4}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "background", + "graphMode": "none", + "textMode": "value", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "noValue": "-", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0.95}, + {"color": "green", "value": 0.99} + ] + } + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "avg(ct_log_uptime_ratio)", + "instant": true, + "refId": "A" + } + ] + }, + { + "id": 4, + "type": "stat", + "title": "Below 100% Uptime", + "description": "Number of log/endpoint pairs with uptime below 100% right now", + "gridPos": {"x": 12, "y": 0, "w": 4, "h": 4}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "background", + "graphMode": "none", + "textMode": "value", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + } + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "count(ct_log_uptime_ratio < 1) or vector(0)", + "instant": 
true, + "refId": "A" + } + ] + }, + { + "id": 5, + "type": "stat", + "title": "Fetch Status", + "description": "Whether the last CSV fetch from gstatic.com succeeded", + "gridPos": {"x": 16, "y": 0, "w": 4, "h": 4}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "background", + "graphMode": "none", + "textMode": "value", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "unit": "short", + "noValue": "-", + "mappings": [ + {"type": "value", "options": {"0": {"text": "FAILED", "color": "red"}, + "1": {"text": "OK", "color": "green"}}} + ], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "green", "value": 1} + ] + } + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "ct_log_uptime_fetch_success", + "instant": true, + "refId": "A" + } + ] + }, + { + "id": 6, + "type": "stat", + "title": "Last Fetch", + "description": "Timestamp of the most recent CSV fetch attempt", + "gridPos": {"x": 20, "y": 0, "w": 4, "h": 4}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "colorMode": "none", + "graphMode": "none", + "textMode": "value", + "justifyMode": "center" + }, + "fieldConfig": { + "defaults": { + "unit": "dateTimeFromNow", + "noValue": "-" + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "ct_log_uptime_fetch_timestamp_seconds * 1000", + "instant": true, + "refId": "A" + } + ] + }, + { + "id": 7, + "type": "row", + "title": "Uptime Over Time", + "gridPos": {"x": 0, "y": 4, "w": 24, "h": 1}, + "collapsed": false, + "panels": [] + }, + { + "id": 8, + "type": "timeseries", + "title": "24h Rolling Uptime", + "description": "How the rolling 24h uptime ratio has changed over the selected time range. 
Each series is one log/endpoint pair.", + "gridPos": {"x": 0, "y": 5, "w": 24, "h": 9}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "tooltip": {"mode": "multi", "sort": "asc"}, + "legend": { + "displayMode": "table", + "placement": "bottom", + "calcs": ["lastNotNull", "min"] + } + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "custom": { + "lineWidth": 1, + "fillOpacity": 0, + "spanNulls": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0.95}, + {"color": "green", "value": 0.99} + ] + }, + "color": {"mode": "palette-classic"} + }, + "overrides": [] + }, + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "ct_log_uptime_ratio{log_url=~\"$log_url\", endpoint=~\"$endpoint\"}", + "legendFormat": "{{log_url}} / {{endpoint}}", + "refId": "A" + } + ] + }, + { + "id": 9, + "type": "row", + "title": "Least Available", + "gridPos": {"x": 0, "y": 14, "w": 24, "h": 1}, + "collapsed": false, + "panels": [] + }, + { + "id": 10, + "type": "table", + "title": "Top $topN Least Available Log/Endpoint Pairs", + "description": "Current snapshot of the $topN log/endpoint pairs with the lowest 24h uptime ratio, filtered by the selected Log URL and Endpoint variables.", + "gridPos": {"x": 0, "y": 15, "w": 12, "h": 10}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "sortBy": [{"displayName": "Uptime", "desc": false}], + "footer": {"show": false} + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "displayMode": "auto" + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "log_url"}, + "properties": [ + {"id": "displayName", "value": "Log URL"}, + {"id": "custom.width", "value": 420} + ] + }, + { + "matcher": {"id": "byName", "options": "endpoint"}, + "properties": [ + {"id": "displayName", "value": "Endpoint"}, + 
{"id": "custom.width", "value": 160} + ] + }, + { + "matcher": {"id": "byRegexp", "options": "/^Value( #A)?$/"}, + "properties": [ + {"id": "displayName", "value": "Uptime"}, + {"id": "unit", "value": "percentunit"}, + {"id": "custom.width", "value": 120}, + {"id": "custom.displayMode", "value": "color-background"}, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0.95}, + {"color": "green", "value": 0.99} + ] + } + } + ] + } + ] + }, + "transformations": [ + {"id": "merge", "options": {}}, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "instance": true, + "job": true + }, + "renameByName": {} + } + } + ], + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "bottomk($topN, ct_log_uptime_ratio{log_url=~\"$log_url\", endpoint=~\"$endpoint\"})", + "instant": true, "format": "table", + "legendFormat": "{{log_url}} / {{endpoint}}", + "refId": "A" + } + ] + }, + { + "id": 11, + "type": "table", + "title": "Logs with any endpoint below 100%", + "description": "CT logs where at least one endpoint has a 24h uptime below 100%, showing the worst (minimum) uptime across all endpoints for that log.", + "gridPos": {"x": 12, "y": 15, "w": 12, "h": 10}, + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "options": { + "sortBy": [{"displayName": "Uptime", "desc": false}], + "footer": {"show": false} + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "displayMode": "auto" + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "log_url"}, + "properties": [ + {"id": "displayName", "value": "Log URL"}, + {"id": "custom.width", "value": 420} + ] + }, + { + "matcher": {"id": "byRegexp", "options": "/^Value( #A)?$/"}, + "properties": [ + {"id": "displayName", "value": "Uptime"}, + {"id": "unit", "value": "percentunit"}, + {"id": "custom.width", "value": 120}, + {"id": "custom.displayMode", 
"value": "color-background"}, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0.95}, + {"color": "green", "value": 0.99} + ] + } + } + ] + } + ] + }, + "transformations": [ + {"id": "merge", "options": {}}, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "endpoint": true, + "instance": true, + "job": true + }, + "renameByName": {} + } + } + ], + "targets": [ + { + "datasource": {"type": "prometheus", "uid": "${datasource}"}, + "expr": "sort(min by (log_url) (ct_log_uptime_ratio{log_url=~\"$log_url\"}) < 1)", + "instant": true, "format": "table", + "legendFormat": "{{log_url}}", + "refId": "A" + } + ] + } + ] +} diff --git a/grafana.png b/grafana.png new file mode 100644 index 0000000..9f3285c Binary files /dev/null and b/grafana.png differ