6647f95be4
Wire-format and metric overhaul. Both file and UDP ingest now share one
versioned ParseLine that dispatches on the v<N>\t prefix; v1 stays
unchanged, v2 adds $bytes_sent (replacing $body_bytes_sent),
$request_length, $upstream_response_time, and $upstream_status. File
ingest gains the same versioning, and the legacy positional file format
is removed (no live deployments).
Prometheus exposition is rewritten:
- nginx_http_bytes_sent and nginx_http_request_duration_seconds gain
a source_tag label.
- nginx_http_requests_by_source_total gains status_class.
- New v2-only metrics: nginx_http_request_bytes,
nginx_http_upstream_duration_seconds,
nginx_http_upstream_requests_total{status_class}.
- Dropped nginx_http_response_body_bytes_by_source (subsumed by the
dual-labeled bytes_sent metric).
Adds 'make fixstyle' (gofmt -w) and clears all golangci-lint findings
across the repo (errcheck, S1001, ST1005, unused).
Docs in design.md FR-2/FR-8 and user-guide.md are rewritten to present
v2 as the recommended log format.
404 lines
13 KiB
Go
404 lines
13 KiB
Go
package main
|
|
|
|
import (
|
|
"bufio"
|
|
"fmt"
|
|
"net/http"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// Histogram layout and safety limits shared by the metric store.
const (
	// promNumBodyBounds is the number of finite size buckets.
	promNumBodyBounds = 7

	// promNumTimeBounds is the number of finite duration buckets.
	promNumTimeBounds = 11

	// promCounterCap is a safety cap on the number of distinct
	// {host,method,status} counter entries.
	promCounterCap = 250_000
)

var (
	// promBodyBounds lists the size-histogram bucket upper bounds, in bytes.
	promBodyBounds = [promNumBodyBounds]int64{
		256, 1024, 4096, 16384, 65536, 262144, 1048576,
	}

	// promTimeBounds lists the duration-histogram bucket upper bounds, in
	// seconds (the Prometheus default buckets).
	promTimeBounds = [promNumTimeBounds]float64{
		.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10,
	}
)
|
|
|
|
// promCounterKey identifies one series of the per-request counter: the
// combination of host, HTTP method and status string it is labelled with.
type promCounterKey struct {
	Host, Method, Status string
}
|
|
|
|
// hostSourceKey is the label pair {host, source_tag} under which each
// histogram series is stored.
type hostSourceKey struct {
	Host, SourceTag string
}
|
|
|
|
// sourceClassKey is the label pair {source_tag, status_class} used by the
// per-source rollup counter.
type sourceClassKey struct {
	SourceTag, StatusClass string
}
|
|
|
|
// upstreamKey is the label set {host, source_tag, status_class} of the
// upstream-only request counter. StatusClass here is the class of the
// upstream's status ($upstream_status), not of the client-facing status.
type upstreamKey struct {
	Host, SourceTag, StatusClass string
}
|
|
|
|
// promBodyEntry holds one body-size histogram (one label-set worth).
|
|
type promBodyEntry struct {
|
|
buckets [promNumBodyBounds + 1]int64 // indices 0..N-1: le=bound[i]; index N: le=+Inf
|
|
sum int64
|
|
}
|
|
|
|
// promTimeEntry holds one duration histogram (one label-set worth).
|
|
type promTimeEntry struct {
|
|
buckets [promNumTimeBounds + 1]int64
|
|
sum float64
|
|
}
|
|
|
|
// PromStore accumulates Prometheus metrics ingested from log records.
|
|
//
|
|
// Ingest must be called from exactly one goroutine (the store's Run goroutine).
|
|
// ServeHTTP may be called from any number of goroutines concurrently.
|
|
type PromStore struct {
|
|
mu sync.Mutex
|
|
counters map[promCounterKey]int64
|
|
bytesSent map[hostSourceKey]*promBodyEntry
|
|
requestDuration map[hostSourceKey]*promTimeEntry
|
|
requestBytes map[hostSourceKey]*promBodyEntry // v2 only
|
|
upstreamDuration map[hostSourceKey]*promTimeEntry // v2 only
|
|
upstreamCounters map[upstreamKey]int64 // v2 only
|
|
sourceCounters map[sourceClassKey]int64
|
|
|
|
udpMu sync.Mutex
|
|
udpPacketsReceived int64
|
|
udpLoglinesSuccess int64
|
|
udpLoglinesConsumed int64
|
|
}
|
|
|
|
// NewPromStore returns an empty PromStore ready for use.
|
|
func NewPromStore() *PromStore {
|
|
return &PromStore{
|
|
counters: make(map[promCounterKey]int64, 1024),
|
|
bytesSent: make(map[hostSourceKey]*promBodyEntry, 64),
|
|
requestDuration: make(map[hostSourceKey]*promTimeEntry, 64),
|
|
requestBytes: make(map[hostSourceKey]*promBodyEntry, 64),
|
|
upstreamDuration: make(map[hostSourceKey]*promTimeEntry, 64),
|
|
upstreamCounters: make(map[upstreamKey]int64, 64),
|
|
sourceCounters: make(map[sourceClassKey]int64, 32),
|
|
}
|
|
}
|
|
|
|
// Ingest records one log record into the Prometheus metrics.
|
|
// Must be called from a single goroutine.
|
|
func (p *PromStore) Ingest(r LogRecord) {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
|
|
hsk := hostSourceKey{Host: r.Website, SourceTag: r.SourceTag}
|
|
|
|
// nginx_http_requests_total{host,method,status} — capped.
|
|
ck := promCounterKey{Host: r.Website, Method: r.Method, Status: r.Status}
|
|
if _, ok := p.counters[ck]; ok {
|
|
p.counters[ck]++
|
|
} else if len(p.counters) < promCounterCap {
|
|
p.counters[ck] = 1
|
|
}
|
|
|
|
observeBody(p.bytesSent, hsk, r.BytesSent)
|
|
observeTime(p.requestDuration, hsk, r.RequestTime)
|
|
if r.RequestLength > 0 {
|
|
observeBody(p.requestBytes, hsk, r.RequestLength)
|
|
}
|
|
|
|
p.sourceCounters[sourceClassKey{
|
|
SourceTag: r.SourceTag,
|
|
StatusClass: statusClass(r.Status),
|
|
}]++
|
|
|
|
if r.HasUpstream {
|
|
observeTime(p.upstreamDuration, hsk, r.UpstreamResponseTime)
|
|
p.upstreamCounters[upstreamKey{
|
|
Host: r.Website,
|
|
SourceTag: r.SourceTag,
|
|
StatusClass: statusClass(r.UpstreamStatus),
|
|
}]++
|
|
}
|
|
}
|
|
|
|
// IncUDPPacket, IncUDPSuccess, IncUDPConsumed bump UDP-ingest counters from
|
|
// the listener goroutine.
|
|
func (p *PromStore) IncUDPPacket() { p.udpMu.Lock(); p.udpPacketsReceived++; p.udpMu.Unlock() }
|
|
func (p *PromStore) IncUDPSuccess() { p.udpMu.Lock(); p.udpLoglinesSuccess++; p.udpMu.Unlock() }
|
|
func (p *PromStore) IncUDPConsumed() { p.udpMu.Lock(); p.udpLoglinesConsumed++; p.udpMu.Unlock() }
|
|
|
|
// statusClass maps an HTTP status string to its class label. Only the first
// byte is inspected: a leading '2', '3', '4' or '5' yields "2xx", "3xx",
// "4xx" or "5xx" respectively. Every other input, including the empty
// string, yields "other".
func statusClass(status string) string {
	classes := [...]string{"2xx", "3xx", "4xx", "5xx"}

	if len(status) == 0 {
		return "other"
	}
	lead := status[0]
	if lead < '2' || lead > '5' {
		return "other"
	}
	return classes[lead-'2']
}
|
|
|
|
func observeBody(m map[hostSourceKey]*promBodyEntry, key hostSourceKey, bytes int64) {
|
|
e, ok := m[key]
|
|
if !ok {
|
|
e = &promBodyEntry{}
|
|
m[key] = e
|
|
}
|
|
for i, bound := range promBodyBounds {
|
|
if bytes <= bound {
|
|
e.buckets[i]++
|
|
}
|
|
}
|
|
e.buckets[promNumBodyBounds]++
|
|
e.sum += bytes
|
|
}
|
|
|
|
func observeTime(m map[hostSourceKey]*promTimeEntry, key hostSourceKey, seconds float64) {
|
|
e, ok := m[key]
|
|
if !ok {
|
|
e = &promTimeEntry{}
|
|
m[key] = e
|
|
}
|
|
for i, bound := range promTimeBounds {
|
|
if seconds <= bound {
|
|
e.buckets[i]++
|
|
}
|
|
}
|
|
e.buckets[promNumTimeBounds]++
|
|
e.sum += seconds
|
|
}
|
|
|
|
// ServeHTTP renders all metrics in the Prometheus text exposition format (0.0.4).
|
|
func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
|
|
type counterSnap struct {
|
|
k promCounterKey
|
|
v int64
|
|
}
|
|
type bodySnap struct {
|
|
k hostSourceKey
|
|
e promBodyEntry
|
|
}
|
|
type timeSnap struct {
|
|
k hostSourceKey
|
|
e promTimeEntry
|
|
}
|
|
type upstreamCounterSnap struct {
|
|
k upstreamKey
|
|
v int64
|
|
}
|
|
type sourceCounterSnap struct {
|
|
k sourceClassKey
|
|
v int64
|
|
}
|
|
|
|
p.mu.Lock()
|
|
|
|
counters := make([]counterSnap, 0, len(p.counters))
|
|
for k, v := range p.counters {
|
|
counters = append(counters, counterSnap{k, v})
|
|
}
|
|
bytesSnaps := make([]bodySnap, 0, len(p.bytesSent))
|
|
for k, e := range p.bytesSent {
|
|
bytesSnaps = append(bytesSnaps, bodySnap{k, *e})
|
|
}
|
|
requestBytesSnaps := make([]bodySnap, 0, len(p.requestBytes))
|
|
for k, e := range p.requestBytes {
|
|
requestBytesSnaps = append(requestBytesSnaps, bodySnap{k, *e})
|
|
}
|
|
requestDurationSnaps := make([]timeSnap, 0, len(p.requestDuration))
|
|
for k, e := range p.requestDuration {
|
|
requestDurationSnaps = append(requestDurationSnaps, timeSnap{k, *e})
|
|
}
|
|
upstreamDurationSnaps := make([]timeSnap, 0, len(p.upstreamDuration))
|
|
for k, e := range p.upstreamDuration {
|
|
upstreamDurationSnaps = append(upstreamDurationSnaps, timeSnap{k, *e})
|
|
}
|
|
upstreamCounters := make([]upstreamCounterSnap, 0, len(p.upstreamCounters))
|
|
for k, v := range p.upstreamCounters {
|
|
upstreamCounters = append(upstreamCounters, upstreamCounterSnap{k, v})
|
|
}
|
|
sourceCounters := make([]sourceCounterSnap, 0, len(p.sourceCounters))
|
|
for k, v := range p.sourceCounters {
|
|
sourceCounters = append(sourceCounters, sourceCounterSnap{k, v})
|
|
}
|
|
|
|
p.mu.Unlock()
|
|
|
|
p.udpMu.Lock()
|
|
udpPackets := p.udpPacketsReceived
|
|
udpSuccess := p.udpLoglinesSuccess
|
|
udpConsumed := p.udpLoglinesConsumed
|
|
p.udpMu.Unlock()
|
|
|
|
sort.Slice(counters, func(i, j int) bool {
|
|
a, b := counters[i].k, counters[j].k
|
|
if a.Host != b.Host {
|
|
return a.Host < b.Host
|
|
}
|
|
if a.Method != b.Method {
|
|
return a.Method < b.Method
|
|
}
|
|
return a.Status < b.Status
|
|
})
|
|
sortBody := func(s []bodySnap) {
|
|
sort.Slice(s, func(i, j int) bool {
|
|
a, b := s[i].k, s[j].k
|
|
if a.Host != b.Host {
|
|
return a.Host < b.Host
|
|
}
|
|
return a.SourceTag < b.SourceTag
|
|
})
|
|
}
|
|
sortTime := func(s []timeSnap) {
|
|
sort.Slice(s, func(i, j int) bool {
|
|
a, b := s[i].k, s[j].k
|
|
if a.Host != b.Host {
|
|
return a.Host < b.Host
|
|
}
|
|
return a.SourceTag < b.SourceTag
|
|
})
|
|
}
|
|
sortBody(bytesSnaps)
|
|
sortBody(requestBytesSnaps)
|
|
sortTime(requestDurationSnaps)
|
|
sortTime(upstreamDurationSnaps)
|
|
sort.Slice(upstreamCounters, func(i, j int) bool {
|
|
a, b := upstreamCounters[i].k, upstreamCounters[j].k
|
|
if a.Host != b.Host {
|
|
return a.Host < b.Host
|
|
}
|
|
if a.SourceTag != b.SourceTag {
|
|
return a.SourceTag < b.SourceTag
|
|
}
|
|
return a.StatusClass < b.StatusClass
|
|
})
|
|
sort.Slice(sourceCounters, func(i, j int) bool {
|
|
a, b := sourceCounters[i].k, sourceCounters[j].k
|
|
if a.SourceTag != b.SourceTag {
|
|
return a.SourceTag < b.SourceTag
|
|
}
|
|
return a.StatusClass < b.StatusClass
|
|
})
|
|
|
|
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
|
|
bw := bufio.NewWriterSize(w, 256*1024)
|
|
|
|
// pf, pln are short helpers so the metric block reads cleanly. Errors on a
|
|
// bufio writer wrapping http.ResponseWriter mean the client disconnected;
|
|
// there's nothing useful to do mid-write — the next call will simply no-op.
|
|
pf := func(format string, a ...any) { _, _ = fmt.Fprintf(bw, format, a...) }
|
|
pln := func(s string) { _, _ = fmt.Fprintln(bw, s) }
|
|
|
|
pln("# HELP nginx_http_requests_total Total number of HTTP requests processed.")
|
|
pln("# TYPE nginx_http_requests_total counter")
|
|
for _, c := range counters {
|
|
pf("nginx_http_requests_total{host=%q,method=%q,status=%q} %d\n",
|
|
c.k.Host, c.k.Method, c.k.Status, c.v)
|
|
}
|
|
|
|
pln("# HELP nginx_http_bytes_sent HTTP response size distribution in bytes (body for v1 records, full wire bytes for v2).")
|
|
pln("# TYPE nginx_http_bytes_sent histogram")
|
|
for _, s := range bytesSnaps {
|
|
writeBodyHistogramHS(bw, "nginx_http_bytes_sent", s.k, s.e)
|
|
}
|
|
|
|
pln("# HELP nginx_http_request_bytes HTTP request size distribution in bytes (v2 emitters only).")
|
|
pln("# TYPE nginx_http_request_bytes histogram")
|
|
for _, s := range requestBytesSnaps {
|
|
writeBodyHistogramHS(bw, "nginx_http_request_bytes", s.k, s.e)
|
|
}
|
|
|
|
pln("# HELP nginx_http_request_duration_seconds HTTP request processing time in seconds.")
|
|
pln("# TYPE nginx_http_request_duration_seconds histogram")
|
|
for _, s := range requestDurationSnaps {
|
|
writeTimeHistogramHS(bw, "nginx_http_request_duration_seconds", s.k, s.e)
|
|
}
|
|
|
|
pln("# HELP nginx_http_upstream_duration_seconds Upstream response time in seconds (v2 emitters only).")
|
|
pln("# TYPE nginx_http_upstream_duration_seconds histogram")
|
|
for _, s := range upstreamDurationSnaps {
|
|
writeTimeHistogramHS(bw, "nginx_http_upstream_duration_seconds", s.k, s.e)
|
|
}
|
|
|
|
pln("# HELP nginx_http_upstream_requests_total Requests served via an upstream, by upstream-status class (v2 emitters only).")
|
|
pln("# TYPE nginx_http_upstream_requests_total counter")
|
|
for _, c := range upstreamCounters {
|
|
pf("nginx_http_upstream_requests_total{host=%q,source_tag=%q,status_class=%q} %d\n",
|
|
c.k.Host, c.k.SourceTag, c.k.StatusClass, c.v)
|
|
}
|
|
|
|
pln("# HELP nginx_http_requests_by_source_total HTTP requests rolled up by source_tag and status class.")
|
|
pln("# TYPE nginx_http_requests_by_source_total counter")
|
|
for _, c := range sourceCounters {
|
|
pf("nginx_http_requests_by_source_total{source_tag=%q,status_class=%q} %d\n",
|
|
c.k.SourceTag, c.k.StatusClass, c.v)
|
|
}
|
|
|
|
pln("# HELP logtail_udp_packets_received_total Datagrams read from the UDP socket.")
|
|
pln("# TYPE logtail_udp_packets_received_total counter")
|
|
pf("logtail_udp_packets_received_total %d\n", udpPackets)
|
|
pln("# HELP logtail_udp_loglines_success_total UDP loglines that parsed successfully.")
|
|
pln("# TYPE logtail_udp_loglines_success_total counter")
|
|
pf("logtail_udp_loglines_success_total %d\n", udpSuccess)
|
|
pln("# HELP logtail_udp_loglines_consumed_total UDP loglines forwarded to the store (not dropped).")
|
|
pln("# TYPE logtail_udp_loglines_consumed_total counter")
|
|
pf("logtail_udp_loglines_consumed_total %d\n", udpConsumed)
|
|
|
|
_ = bw.Flush()
|
|
}
|
|
|
|
func writeBodyHistogramHS(bw *bufio.Writer, metric string, k hostSourceKey, e promBodyEntry) {
|
|
pf := func(format string, a ...any) { _, _ = fmt.Fprintf(bw, format, a...) }
|
|
for i, bound := range promBodyBounds {
|
|
pf("%s_bucket{host=%q,source_tag=%q,le=\"%d\"} %d\n",
|
|
metric, k.Host, k.SourceTag, bound, e.buckets[i])
|
|
}
|
|
pf("%s_bucket{host=%q,source_tag=%q,le=\"+Inf\"} %d\n",
|
|
metric, k.Host, k.SourceTag, e.buckets[promNumBodyBounds])
|
|
pf("%s_count{host=%q,source_tag=%q} %d\n",
|
|
metric, k.Host, k.SourceTag, e.buckets[promNumBodyBounds])
|
|
pf("%s_sum{host=%q,source_tag=%q} %d\n",
|
|
metric, k.Host, k.SourceTag, e.sum)
|
|
}
|
|
|
|
func writeTimeHistogramHS(bw *bufio.Writer, metric string, k hostSourceKey, e promTimeEntry) {
|
|
pf := func(format string, a ...any) { _, _ = fmt.Fprintf(bw, format, a...) }
|
|
for i, bound := range promTimeBounds {
|
|
pf("%s_bucket{host=%q,source_tag=%q,le=%q} %d\n",
|
|
metric, k.Host, k.SourceTag, formatFloat(bound), e.buckets[i])
|
|
}
|
|
pf("%s_bucket{host=%q,source_tag=%q,le=\"+Inf\"} %d\n",
|
|
metric, k.Host, k.SourceTag, e.buckets[promNumTimeBounds])
|
|
pf("%s_count{host=%q,source_tag=%q} %d\n",
|
|
metric, k.Host, k.SourceTag, e.buckets[promNumTimeBounds])
|
|
pf("%s_sum{host=%q,source_tag=%q} %g\n",
|
|
metric, k.Host, k.SourceTag, e.sum)
|
|
}
|
|
|
|
// formatFloat renders a float64 bucket bound for use as an "le" label value.
//
// The value is formatted with %g (shortest representation, no trailing
// zeros). A result that is a plain integer gets ".0" appended so the bound
// always reads as a float: 0.5 -> "0.5", 1 -> "1.0", 10 -> "10.0".
//
// Results that already carry a decimal point or an exponent ("2.5",
// "1e+21") are returned unchanged, and so are the non-finite renderings
// "+Inf", "-Inf" and "NaN"; appending ".0" to those would produce a string
// that is not a number at all (e.g. "+Inf.0").
func formatFloat(f float64) string {
	s := fmt.Sprintf("%g", f)
	// '.' and 'e' mark decimal/exponent forms; 'I' and 'N' occur only in the
	// %g spellings of the infinities and NaN.
	if strings.ContainsAny(s, ".eIN") {
		return s
	}
	return s + ".0"
}
|