RELEASE 1.0.1: v2 log format, source_tag-labeled metrics, lint cleanup

Wire-format and metric overhaul. Both file and UDP ingest now share one
versioned ParseLine that dispatches on the v<N>\t prefix; v1 stays
unchanged, v2 adds $bytes_sent (replacing $body_bytes_sent),
$request_length, $upstream_response_time, and $upstream_status. File
ingest gains the same versioning, and the legacy positional file format
is removed (no live deployments).

Prometheus exposition is rewritten:

  - nginx_http_bytes_sent and nginx_http_request_duration_seconds gain
    a source_tag label.
  - nginx_http_requests_by_source_total gains status_class.
  - New v2-only metrics: nginx_http_request_bytes,
    nginx_http_upstream_duration_seconds,
    nginx_http_upstream_requests_total{status_class}.
  - Dropped nginx_http_response_body_bytes_by_source (subsumed by the
    dual-labeled bytes_sent metric).

Adds 'make fixstyle' (gofmt -w) and clears all golangci-lint findings
across the repo (errcheck, S1001, ST1005, unused).

Docs in design.md FR-2/FR-8 and user-guide.md are rewritten to present
v2 as the recommended log format.
This commit is contained in:
2026-05-01 15:40:53 +02:00
parent d1a21a7a62
commit 6647f95be4
28 changed files with 931 additions and 724 deletions
+8 -2
View File
@@ -5,7 +5,7 @@ PROTO_FILE := $(PROTO_DIR)/logtail.proto
GEN_FILES := proto/logtailpb/logtail.pb.go proto/logtailpb/logtail_grpc.pb.go
NATIVE_ARCH := $(shell go env GOARCH)
VERSION := 0.9.2
VERSION := 1.0.1
COMMIT_HASH := $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE := $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -s -w \
@@ -28,7 +28,7 @@ GO_VERSION ?= 1.24.6
# accepts. Older releases can't parse recent Go syntax.
GOLANGCI_LINT_VERSION ?= 1.64.0
.PHONY: help all build build-amd64 build-arm64 test lint proto pkg-deb docker docker-push clean \
.PHONY: help all build build-amd64 build-arm64 test lint fixstyle proto pkg-deb docker docker-push clean \
install-deps install-deps-apt install-deps-go install-deps-go-tools
# help is the default target. Keep the list aligned with the .PHONY block above.
@@ -40,6 +40,7 @@ help:
@echo " build-arm64 build all four binaries for linux/arm64 into build/arm64/"
@echo " test run 'go test ./...'"
@echo " lint run 'golangci-lint run ./...'"
@echo " fixstyle rewrite all .go files in place with 'gofmt -w'"
@echo " proto regenerate proto/logtailpb/*.pb.go from proto/logtail.proto"
@echo " pkg-deb build amd64 + arm64 .deb packages (requires dpkg-deb)"
@echo " docker buildx --load two tags ($(IMAGE):v$(VERSION), $(IMAGE):latest) for the native arch"
@@ -74,6 +75,11 @@ test: $(GEN_FILES)
lint:
golangci-lint run ./...
# fixstyle rewrites all tracked .go files with gofmt. Generated proto files are
# excluded — they are regenerated from .proto via `make proto`.
fixstyle:
gofmt -w $(shell find . -name '*.go' -not -path './proto/logtailpb/*')
proto: $(GEN_FILES)
# protoc's go_package option places output at the go_package path (not source-relative).
+3 -3
View File
@@ -262,7 +262,7 @@ func startFakeCollector(t *testing.T, snaps []*pb.Snapshot) string {
}
srv := grpc.NewServer()
pb.RegisterLogtailServiceServer(srv, &fakeCollector{snaps: snaps})
go srv.Serve(lis)
go func() { _ = srv.Serve(lis) }()
t.Cleanup(srv.Stop)
return lis.Addr().String()
}
@@ -310,7 +310,7 @@ func TestGRPCEndToEnd(t *testing.T) {
}
grpcSrv := grpc.NewServer()
pb.RegisterLogtailServiceServer(grpcSrv, NewServer(cache, "agg-test", NewTargetRegistry(nil)))
go grpcSrv.Serve(lis)
go func() { _ = grpcSrv.Serve(lis) }()
defer grpcSrv.Stop()
conn, err := grpc.NewClient(lis.Addr().String(),
@@ -318,7 +318,7 @@ func TestGRPCEndToEnd(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer conn.Close()
defer func() { _ = conn.Close() }()
client := pb.NewLogtailServiceClient(conn)
qctx, qcancel := context.WithTimeout(context.Background(), 5*time.Second)
defer qcancel()
+1 -1
View File
@@ -84,7 +84,7 @@ func dumpCollector(ctx context.Context, addr string) (fine, coarse []st.Snapshot
if err != nil {
return nil, nil, err
}
defer conn.Close()
defer func() { _ = conn.Close() }()
client := pb.NewLogtailServiceClient(conn)
stream, err := client.DumpSnapshots(ctx, &pb.DumpSnapshotsRequest{})
+2 -6
View File
@@ -97,15 +97,11 @@ func (c *Cache) LoadHistorical(fine, coarse []st.Snapshot) {
c.mu.Lock()
defer c.mu.Unlock()
for i, snap := range fine {
c.fineRing[i] = snap
}
copy(c.fineRing[:], fine)
c.fineFilled = len(fine)
c.fineHead = len(fine) % st.FineRingSize
for i, snap := range coarse {
c.coarseRing[i] = snap
}
copy(c.coarseRing[:], coarse)
c.coarseFilled = len(coarse)
c.coarseHead = len(coarse) % st.CoarseRingSize
}
+1 -1
View File
@@ -77,7 +77,7 @@ func (cs *CollectorSub) stream(ctx context.Context) (bool, error) {
if err != nil {
return false, err
}
defer conn.Close()
defer func() { _ = conn.Close() }()
client := pb.NewLogtailServiceClient(conn)
stream, err := client.StreamSnapshots(ctx, &pb.SnapshotRequest{})
+1 -29
View File
@@ -1,7 +1,6 @@
package main
import (
"bytes"
"context"
"encoding/json"
"net"
@@ -11,7 +10,6 @@ import (
pb "git.ipng.ch/ipng/nginx-logtail/proto/logtailpb"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
)
// --- Unit tests ---
@@ -149,21 +147,11 @@ func startFake(t *testing.T, fs *fakeServer) string {
}
srv := grpc.NewServer()
pb.RegisterLogtailServiceServer(srv, fs)
go srv.Serve(lis)
go func() { _ = srv.Serve(lis) }()
t.Cleanup(srv.GracefulStop)
return lis.Addr().String()
}
func dialTest(t *testing.T, addr string) pb.LogtailServiceClient {
t.Helper()
conn, err := grpc.NewClient(addr, grpc.WithTransportCredentials(insecure.NewCredentials()))
if err != nil {
t.Fatal(err)
}
t.Cleanup(func() { conn.Close() })
return pb.NewLogtailServiceClient(conn)
}
// --- TopN tests ---
func TestTopNSingleTarget(t *testing.T) {
@@ -225,23 +213,7 @@ func TestTopNJSON(t *testing.T) {
})
results := fanOutTopN([]string{addr}, nil, pb.GroupBy_WEBSITE, 10, pb.Window_W5M)
var buf bytes.Buffer
// Redirect stdout not needed; call JSON formatter directly.
r := results[0]
// Build expected JSON by calling printTopNJSON with a captured stdout.
// We test indirectly: marshal manually and compare fields.
type entry struct {
Label string `json:"label"`
Count int64 `json:"count"`
}
type out struct {
Source string `json:"source"`
Target string `json:"target"`
Entries []entry `json:"entries"`
}
_ = buf
_ = r
// Verify the response fields are correct for JSON serialization.
if r.resp.Source != "agg" {
t.Errorf("source = %q", r.resp.Source)
}
+2 -2
View File
@@ -24,7 +24,7 @@ type streamEvent struct {
func runStream(args []string) {
fs := flag.NewFlagSet("stream", flag.ExitOnError)
sf, targetFlag := bindShared(fs)
fs.Parse(args)
_ = fs.Parse(args) // ExitOnError: only returns nil here
sf.resolve(*targetFlag)
filter := buildFilter(sf)
@@ -85,7 +85,7 @@ func streamOnce(ctx context.Context, addr string, filter *pb.Filter, events chan
if err != nil {
return err
}
defer conn.Close()
defer func() { _ = conn.Close() }()
stream, err := client.StreamSnapshots(ctx, &pb.SnapshotRequest{})
if err != nil {
+3 -3
View File
@@ -18,7 +18,7 @@ func runTargets(args []string) {
fmt.Fprintln(os.Stderr, "usage: logtail-cli targets [--target host:port] [--json]")
fs.PrintDefaults()
}
fs.Parse(args)
_ = fs.Parse(args) // ExitOnError: only returns nil here
sf.resolve(*target)
for _, addr := range sf.targets {
@@ -30,7 +30,7 @@ func runTargets(args []string) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
resp, err := client.ListTargets(ctx, &pb.ListTargetsRequest{})
cancel()
conn.Close()
_ = conn.Close()
if err != nil {
fmt.Fprintf(os.Stderr, "targets: %s: %v\n", addr, err)
continue
@@ -43,7 +43,7 @@ func runTargets(args []string) {
Addr string `json:"addr"`
}
for _, t := range resp.Targets {
json.NewEncoder(os.Stdout).Encode(row{QueryTarget: addr, Name: t.Name, Addr: t.Addr})
_ = json.NewEncoder(os.Stdout).Encode(row{QueryTarget: addr, Name: t.Name, Addr: t.Addr})
}
} else {
if len(sf.targets) > 1 {
+2 -2
View File
@@ -24,7 +24,7 @@ func runTopN(args []string) {
n := fs.Int("n", 10, "number of entries")
window := fs.String("window", "5m", "time window: 1m 5m 15m 60m 6h 24h")
groupBy := fs.String("group-by", "website", "group by: website prefix uri status")
fs.Parse(args)
_ = fs.Parse(args) // ExitOnError: only returns nil here
sf.resolve(*targetFlag)
win := parseWindow(*window)
@@ -66,7 +66,7 @@ func fanOutTopN(targets []string, filter *pb.Filter, groupBy pb.GroupBy, n int,
results[i].resp = &pb.TopNResponse{}
return
}
defer conn.Close()
defer func() { _ = conn.Close() }()
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
resp, err := client.TopN(ctx, &pb.TopNRequest{
+2 -2
View File
@@ -22,7 +22,7 @@ func runTrend(args []string) {
fs := flag.NewFlagSet("trend", flag.ExitOnError)
sf, targetFlag := bindShared(fs)
window := fs.String("window", "5m", "time window: 1m 5m 15m 60m 6h 24h")
fs.Parse(args)
_ = fs.Parse(args) // ExitOnError: only returns nil here
sf.resolve(*targetFlag)
win := parseWindow(*window)
@@ -63,7 +63,7 @@ func fanOutTrend(targets []string, filter *pb.Filter, window pb.Window) []trendR
results[i].resp = &pb.TrendResponse{}
return
}
defer conn.Close()
defer func() { _ = conn.Close() }()
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
resp, err := client.Trend(ctx, &pb.TrendRequest{
-6
View File
@@ -167,9 +167,3 @@ func parseGroupBy(s string) pb.GroupBy {
panic("unreachable")
}
}
func dieUsage(fs *flag.FlagSet, msg string) {
fmt.Fprintln(os.Stderr, msg)
fs.PrintDefaults()
os.Exit(1)
}
+3 -3
View File
@@ -16,17 +16,17 @@ func printTable(w io.Writer, rows [][]string) {
}
tw := tabwriter.NewWriter(w, 0, 0, 2, ' ', 0)
for i, row := range rows {
fmt.Fprintln(tw, strings.Join(row, "\t"))
_, _ = fmt.Fprintln(tw, strings.Join(row, "\t"))
if i == 0 {
// Print a divider matching the header width.
dashes := make([]string, len(row))
for j, h := range row {
dashes[j] = strings.Repeat("-", len(h))
}
fmt.Fprintln(tw, strings.Join(dashes, "\t"))
_, _ = fmt.Fprintln(tw, strings.Join(dashes, "\t"))
}
}
tw.Flush()
_ = tw.Flush()
}
// fmtCount formats a count with a space as the thousands separator.
+1 -1
View File
@@ -128,7 +128,7 @@ func collectPatterns(logPaths, logsFile string) []string {
if err != nil {
log.Fatalf("collector: cannot open --logs-file %s: %v", logsFile, err)
}
defer f.Close()
defer func() { _ = f.Close() }()
sc := bufio.NewScanner(f)
for sc.Scan() {
if p := strings.TrimSpace(sc.Text()); p != "" && !strings.HasPrefix(p, "#") {
+82 -53
View File
@@ -8,6 +8,13 @@ import (
)
// LogRecord holds the dimensions extracted from a single nginx log line.
//
// BytesSent carries $body_bytes_sent for v1 records and $bytes_sent for v2
// records — operators see a small step up when emitters move to v2 because v2
// includes header overhead.
//
// RequestLength, UpstreamResponseTime, UpstreamStatus, HasUpstream are v2-only.
// In v1 records HasUpstream is always false and the related fields are zero.
type LogRecord struct {
Website string
ClientPrefix string
@@ -16,79 +23,42 @@ type LogRecord struct {
IsTor bool
ASN int32
Method string
BodyBytesSent int64
BytesSent int64
RequestLength int64
RequestTime float64
UpstreamResponseTime float64
UpstreamStatus string
HasUpstream bool
SourceTag string
}
// fileSourceTag is the SourceTag assigned to records read from on-disk log
// files, which pre-date the tag concept. Mirrors nginx's fallback label.
const fileSourceTag = "direct"
// ParseLine parses a tab-separated logtail log line from a file:
//
// $host \t $remote_addr \t $msec \t $request_method \t $request_uri \t $status \t $body_bytes_sent \t $request_time \t $is_tor \t $asn
//
// The is_tor (field 9) and asn (field 10) fields are optional for backward
// compatibility with older log files that omit them; they default to false/0
// when absent. SourceTag is always set to "direct" (file origin has no tag).
// Returns false for lines with fewer than 8 fields.
// ParseLine parses a versioned nginx-logtail line. Both file ingest and UDP
// ingest funnel through here. Every line MUST start with "v<N>\t"; unknown or
// missing versions return false so operators can ship a parser update before
// the emitter switches.
func ParseLine(line string, v4bits, v6bits int) (LogRecord, bool) {
fields := strings.SplitN(line, "\t", 10)
if len(fields) < 8 {
return LogRecord{}, false
}
prefix, ok := truncateIP(fields[1], v4bits, v6bits)
if !ok {
return LogRecord{}, false
}
isTor := len(fields) >= 9 && fields[8] == "1"
var asn int32
if len(fields) == 10 {
if n, err := strconv.ParseInt(fields[9], 10, 32); err == nil {
asn = int32(n)
}
}
return LogRecord{
Website: fields[0],
ClientPrefix: prefix,
URI: stripQuery(fields[4]),
Status: fields[5],
IsTor: isTor,
ASN: asn,
Method: fields[3],
BodyBytesSent: parseInt(fields[6]),
RequestTime: parseFloat(fields[7]),
SourceTag: fileSourceTag,
}, true
}
// ParseUDPLine dispatches on the version prefix emitted by
// nginx-ipng-stats-plugin's ipng_stats_logtail directive. The wire format is
// "v<N>\t<payload>", where <payload> is version-specific. Unknown or missing
// versions return false so operators can roll out a v2 parser before
// upgrading emitters.
func ParseUDPLine(line string, v4bits, v6bits int) (LogRecord, bool) {
i := strings.IndexByte(line, '\t')
if i < 0 {
return LogRecord{}, false
}
switch line[:i] {
case "v1":
return parseUDPLineV1(line[i+1:], v4bits, v6bits)
return parseV1(line[i+1:], v4bits, v6bits)
case "v2":
return parseV2(line[i+1:], v4bits, v6bits)
default:
return LogRecord{}, false
}
}
// parseUDPLineV1 parses the v1 payload (12 tab-separated fields):
// parseV1 parses the v1 payload (12 tab-separated fields):
//
// $host \t $remote_addr \t $request_method \t $request_uri \t $status \t
// $body_bytes_sent \t $request_time \t $is_tor \t $asn \t
// $ipng_source_tag \t $server_addr \t $scheme
//
// server_addr and scheme are parsed but discarded.
func parseUDPLineV1(payload string, v4bits, v6bits int) (LogRecord, bool) {
// $server_addr and $scheme are parsed but discarded.
func parseV1(payload string, v4bits, v6bits int) (LogRecord, bool) {
fields := strings.Split(payload, "\t")
if len(fields) != 12 {
return LogRecord{}, false
@@ -109,12 +79,71 @@ func parseUDPLineV1(payload string, v4bits, v6bits int) (LogRecord, bool) {
IsTor: fields[7] == "1",
ASN: asn,
Method: fields[2],
BodyBytesSent: parseInt(fields[5]),
BytesSent: parseInt(fields[5]),
RequestTime: parseFloat(fields[6]),
SourceTag: fields[9],
}, true
}
// parseV2 parses the v2 payload (15 tab-separated fields):
//
// $host \t $remote_addr \t $request_method \t $request_uri \t $status \t
// $bytes_sent \t $request_length \t $request_time \t
// $upstream_response_time \t $upstream_status \t
// $is_tor \t $asn \t $ipng_source_tag \t $server_addr \t $scheme
//
// $upstream_response_time and $upstream_status are "-" (or empty) when nginx
// served the response directly — HasUpstream is left false in that case.
// When nginx retried across multiple upstreams the fields are comma-separated;
// the parser keeps the last entry, since that's the upstream that actually
// served the response. $server_addr and $scheme are parsed but discarded.
func parseV2(payload string, v4bits, v6bits int) (LogRecord, bool) {
fields := strings.Split(payload, "\t")
if len(fields) != 15 {
return LogRecord{}, false
}
prefix, ok := truncateIP(fields[1], v4bits, v6bits)
if !ok {
return LogRecord{}, false
}
var asn int32
if n, err := strconv.ParseInt(fields[11], 10, 32); err == nil {
asn = int32(n)
}
r := LogRecord{
Website: fields[0],
ClientPrefix: prefix,
URI: stripQuery(fields[3]),
Status: fields[4],
IsTor: fields[10] == "1",
ASN: asn,
Method: fields[2],
BytesSent: parseInt(fields[5]),
RequestLength: parseInt(fields[6]),
RequestTime: parseFloat(fields[7]),
SourceTag: fields[12],
}
if fields[8] != "-" && fields[8] != "" {
timeStr := lastCommaPart(fields[8])
statusStr := lastCommaPart(fields[9])
if t, err := strconv.ParseFloat(timeStr, 64); err == nil {
r.UpstreamResponseTime = t
r.UpstreamStatus = statusStr
r.HasUpstream = true
}
}
return r, true
}
// lastCommaPart returns the substring after the last ", " (nginx's separator
// for retried upstreams). Plain values pass through unchanged.
func lastCommaPart(s string) string {
if i := strings.LastIndex(s, ", "); i >= 0 {
return s[i+2:]
}
return s
}
func stripQuery(uri string) string {
if i := strings.IndexByte(uri, '?'); i >= 0 {
return uri[:i]
+147 -228
View File
@@ -4,215 +4,7 @@ import (
"testing"
)
func TestParseLine(t *testing.T) {
good := "www.example.com\t1.2.3.4\t1741954800.123\tGET\t/api/v1/search?q=foo&x=1\t200\t1452\t0.043"
tests := []struct {
name string
line string
wantOK bool
want LogRecord
}{
{
name: "normal IPv4 line strips query string",
line: good,
wantOK: true,
want: LogRecord{
Website: "www.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/api/v1/search",
Status: "200",
Method: "GET",
BodyBytesSent: 1452,
RequestTime: 0.043,
SourceTag: "direct",
},
},
{
name: "URI with no query string",
line: "host\t10.0.0.1\t0\tPOST\t/submit\t201\t0\t0.001",
wantOK: true,
want: LogRecord{
Website: "host",
ClientPrefix: "10.0.0.0/24",
URI: "/submit",
Status: "201",
Method: "POST",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "IPv6 address truncated to /48",
line: "host\t2001:db8:cafe::1\t0\tGET\t/\t200\t0\t0.001",
wantOK: true,
want: LogRecord{
Website: "host",
ClientPrefix: "2001:db8:cafe::/48",
URI: "/",
Status: "200",
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "too few fields returns false",
line: "host\t1.2.3.4\t0\tGET\t/",
wantOK: false,
},
{
name: "empty line returns false",
line: "",
wantOK: false,
},
{
name: "invalid IP returns false",
line: "host\tnot-an-ip\t0\tGET\t/\t200\t0\t0.001",
wantOK: false,
},
{
name: "status 429",
line: "api.example.com\t5.6.7.8\t0\tGET\t/rate-limited\t429\t0\t0.001",
wantOK: true,
want: LogRecord{
Website: "api.example.com",
ClientPrefix: "5.6.7.0/24",
URI: "/rate-limited",
Status: "429",
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "is_tor=1 sets IsTor true",
line: "tor.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001\t1",
wantOK: true,
want: LogRecord{
Website: "tor.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: true,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "is_tor=0 sets IsTor false",
line: "normal.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001\t0",
wantOK: true,
want: LogRecord{
Website: "normal.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: false,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "missing is_tor field defaults to false (backward compat)",
line: "old.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001",
wantOK: true,
want: LogRecord{
Website: "old.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: false,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "asn field parsed",
line: "asn.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001\t0\t12345",
wantOK: true,
want: LogRecord{
Website: "asn.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: false,
ASN: 12345,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "asn field with is_tor=1",
line: "both.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001\t1\t65535",
wantOK: true,
want: LogRecord{
Website: "both.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: true,
ASN: 65535,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "missing asn field defaults to 0 (backward compat)",
line: "noasn.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001\t1",
wantOK: true,
want: LogRecord{
Website: "noasn.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: true,
ASN: 0,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "invalid asn field defaults to 0",
line: "badann.example.com\t1.2.3.4\t0\tGET\t/\t200\t0\t0.001\t0\tnot-a-number",
wantOK: true,
want: LogRecord{
Website: "badann.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
IsTor: false,
ASN: 0,
Method: "GET",
RequestTime: 0.001,
SourceTag: "direct",
},
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, ok := ParseLine(tc.line, 24, 48)
if ok != tc.wantOK {
t.Fatalf("ParseLine ok=%v, want %v", ok, tc.wantOK)
}
if !tc.wantOK {
return
}
if got != tc.want {
t.Errorf("got %+v, want %+v", got, tc.want)
}
})
}
}
func TestParseUDPLine(t *testing.T) {
func TestParseLineV1(t *testing.T) {
// v1 \t host \t remote_addr \t method \t uri \t status \t body_bytes \t req_time \t
// is_tor \t asn \t source_tag \t server_addr \t scheme
good := "v1\twww.example.com\t1.2.3.4\tGET\t/api/v1/search?q=foo\t200\t1452\t0.043\t0\t12345\tcdn\t10.0.0.1\thttps"
@@ -235,7 +27,7 @@ func TestParseUDPLine(t *testing.T) {
IsTor: false,
ASN: 12345,
Method: "GET",
BodyBytesSent: 1452,
BytesSent: 1452,
RequestTime: 0.043,
SourceTag: "cdn",
},
@@ -252,7 +44,7 @@ func TestParseUDPLine(t *testing.T) {
IsTor: true,
ASN: 65535,
Method: "GET",
BodyBytesSent: 0,
BytesSent: 0,
RequestTime: 0,
SourceTag: "direct",
},
@@ -272,28 +64,13 @@ func TestParseUDPLine(t *testing.T) {
line: "v1\th\tnope\tGET\t/\t200\t0\t0\t0\t0\ttag\t10.0.0.1\thttp",
wantOK: false,
},
{
name: "unknown version rejected (future v2)",
line: "v2\twww.example.com\t1.2.3.4\tGET\t/\t200\t0\t0\t0\t0\ttag\t10.0.0.1\thttp",
wantOK: false,
},
{
name: "missing version prefix rejected (legacy 12-field line)",
line: "www.example.com\t1.2.3.4\tGET\t/\t200\t0\t0\t0\t0\ttag\t10.0.0.1\thttp",
wantOK: false,
},
{
name: "no tab at all rejected",
line: "v1",
wantOK: false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, ok := ParseUDPLine(tc.line, 24, 48)
got, ok := ParseLine(tc.line, 24, 48)
if ok != tc.wantOK {
t.Fatalf("ParseUDPLine ok=%v, want %v; got=%+v", ok, tc.wantOK, got)
t.Fatalf("ParseLine ok=%v, want %v; got=%+v", ok, tc.wantOK, got)
}
if !tc.wantOK {
return
@@ -305,6 +82,148 @@ func TestParseUDPLine(t *testing.T) {
}
}
func TestParseLineV2(t *testing.T) {
// v2 \t host \t remote_addr \t method \t uri \t status \t bytes_sent \t request_length \t
// request_time \t upstream_response_time \t upstream_status \t is_tor \t asn \t
// source_tag \t server_addr \t scheme
full := "v2\twww.example.com\t1.2.3.4\tGET\t/api/v1/search?q=foo\t200\t1500\t421\t0.043\t0.012\t200\t0\t12345\tcdn\t10.0.0.1\thttps"
tests := []struct {
name string
line string
wantOK bool
want LogRecord
}{
{
name: "v2 with upstream",
line: full,
wantOK: true,
want: LogRecord{
Website: "www.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/api/v1/search",
Status: "200",
ASN: 12345,
Method: "GET",
BytesSent: 1500,
RequestLength: 421,
RequestTime: 0.043,
UpstreamResponseTime: 0.012,
UpstreamStatus: "200",
HasUpstream: true,
SourceTag: "cdn",
},
},
{
name: "v2 no upstream (dash sentinels)",
line: "v2\twww.example.com\t1.2.3.4\tGET\t/static.html\t200\t900\t300\t0.001\t-\t-\t0\t0\tdirect\t10.0.0.1\thttps",
wantOK: true,
want: LogRecord{
Website: "www.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/static.html",
Status: "200",
Method: "GET",
BytesSent: 900,
RequestLength: 300,
RequestTime: 0.001,
SourceTag: "direct",
},
},
{
name: "v2 no upstream (empty fields)",
line: "v2\thh\t1.2.3.4\tGET\t/\t301\t200\t100\t0\t\t\t0\t0\tdirect\t10.0.0.1\thttps",
wantOK: true,
want: LogRecord{
Website: "hh",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "301",
Method: "GET",
BytesSent: 200,
RequestLength: 100,
SourceTag: "direct",
},
},
{
name: "v2 retried upstreams (comma-separated, last wins)",
line: "v2\twww.example.com\t1.2.3.4\tGET\t/api\t502\t900\t300\t1.500\t0.500, 1.000\t504, 502\t0\t0\tcdn\t10.0.0.1\thttps",
wantOK: true,
want: LogRecord{
Website: "www.example.com",
ClientPrefix: "1.2.3.0/24",
URI: "/api",
Status: "502",
Method: "GET",
BytesSent: 900,
RequestLength: 300,
RequestTime: 1.500,
UpstreamResponseTime: 1.000,
UpstreamStatus: "502",
HasUpstream: true,
SourceTag: "cdn",
},
},
{
name: "v2 wrong field count (14) rejected",
line: "v2\twww.example.com\t1.2.3.4\tGET\t/\t200\t0\t0\t0\t-\t-\t0\t0\tcdn\t10.0.0.1",
wantOK: false,
},
{
name: "v2 bad IP rejected",
line: "v2\thh\tnope\tGET\t/\t200\t0\t0\t0\t-\t-\t0\t0\tcdn\t10.0.0.1\thttps",
wantOK: false,
},
{
name: "v2 bad upstream time leaves HasUpstream=false",
line: "v2\thh\t1.2.3.4\tGET\t/\t200\t0\t0\t0\tnotanumber\t200\t0\t0\tcdn\t10.0.0.1\thttps",
wantOK: true,
want: LogRecord{
Website: "hh",
ClientPrefix: "1.2.3.0/24",
URI: "/",
Status: "200",
Method: "GET",
SourceTag: "cdn",
},
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
got, ok := ParseLine(tc.line, 24, 48)
if ok != tc.wantOK {
t.Fatalf("ParseLine ok=%v, want %v; got=%+v", ok, tc.wantOK, got)
}
if !tc.wantOK {
return
}
if got != tc.want {
t.Errorf("got %+v, want %+v", got, tc.want)
}
})
}
}
func TestParseLineRejections(t *testing.T) {
tests := []struct {
name string
line string
}{
{"empty line", ""},
{"no tab at all", "v1"},
{"unknown version v3", "v3\twww.example.com\t1.2.3.4\tGET\t/\t200\t0\t0\t0\t0\ttag\t10.0.0.1\thttp"},
{"missing version prefix (legacy file format)", "www.example.com\t1.2.3.4\t1741954800.123\tGET\t/api\t200\t1452\t0.043"},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
if _, ok := ParseLine(tc.line, 24, 48); ok {
t.Errorf("expected rejection for %q", tc.line)
}
})
}
}
func TestTruncateIP(t *testing.T) {
tests := []struct {
addr string
+251 -134
View File
@@ -14,27 +14,46 @@ const promNumBodyBounds = 7
var promBodyBounds = [promNumBodyBounds]int64{256, 1024, 4096, 16384, 65536, 262144, 1048576}
// Request-time histogram bucket upper bounds in seconds (standard Prometheus defaults).
// Duration histogram bucket upper bounds in seconds (Prometheus defaults).
const promNumTimeBounds = 11
var promTimeBounds = [promNumTimeBounds]float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10}
const promCounterCap = 250_000 // safety cap on {host,method,status} counter entries
// promCounterKey is the label set for per-request counters.
// promCounterKey is the label set for the per-request counter.
type promCounterKey struct {
Host string
Method string
Status string
}
// promBodyEntry holds the body_bytes_sent histogram for one host.
// hostSourceKey labels histograms by {host, source_tag}.
type hostSourceKey struct {
Host string
SourceTag string
}
// sourceClassKey labels the source-tag rollup counter.
type sourceClassKey struct {
SourceTag string
StatusClass string
}
// upstreamKey labels the upstream-only request counter.
type upstreamKey struct {
Host string
SourceTag string
StatusClass string // class of $upstream_status
}
// promBodyEntry holds one body-size histogram (one label-set worth).
type promBodyEntry struct {
buckets [promNumBodyBounds + 1]int64 // indices 0..N-1: le=bound[i]; index N: le=+Inf
sum int64
}
// promTimeEntry holds the request_time histogram for one host.
// promTimeEntry holds one duration histogram (one label-set worth).
type promTimeEntry struct {
buckets [promNumTimeBounds + 1]int64
sum float64
@@ -47,28 +66,29 @@ type promTimeEntry struct {
type PromStore struct {
mu sync.Mutex
counters map[promCounterKey]int64
body map[string]*promBodyEntry // keyed by host
reqTime map[string]*promTimeEntry // keyed by host
bytesSent map[hostSourceKey]*promBodyEntry
requestDuration map[hostSourceKey]*promTimeEntry
requestBytes map[hostSourceKey]*promBodyEntry // v2 only
upstreamDuration map[hostSourceKey]*promTimeEntry // v2 only
upstreamCounters map[upstreamKey]int64 // v2 only
sourceCounters map[sourceClassKey]int64
// per-source_tag rollups (parallel series, not a cross-product with host)
sourceCounters map[string]int64 // keyed by source_tag
sourceBody map[string]*promBodyEntry // keyed by source_tag
// UDP ingest counters — protected by their own atomic-friendly lock.
udpMu sync.Mutex
udpPacketsReceived int64 // datagrams read off the socket
udpLoglinesSuccess int64 // successfully parsed
udpLoglinesConsumed int64 // successfully forwarded to the store channel
udpPacketsReceived int64
udpLoglinesSuccess int64
udpLoglinesConsumed int64
}
// NewPromStore returns an empty PromStore ready for use.
func NewPromStore() *PromStore {
return &PromStore{
counters: make(map[promCounterKey]int64, 1024),
body: make(map[string]*promBodyEntry, 64),
reqTime: make(map[string]*promTimeEntry, 64),
sourceCounters: make(map[string]int64, 32),
sourceBody: make(map[string]*promBodyEntry, 32),
bytesSent: make(map[hostSourceKey]*promBodyEntry, 64),
requestDuration: make(map[hostSourceKey]*promTimeEntry, 64),
requestBytes: make(map[hostSourceKey]*promBodyEntry, 64),
upstreamDuration: make(map[hostSourceKey]*promTimeEntry, 64),
upstreamCounters: make(map[upstreamKey]int64, 64),
sourceCounters: make(map[sourceClassKey]int64, 32),
}
}
@@ -76,8 +96,11 @@ func NewPromStore() *PromStore {
// Must be called from a single goroutine.
func (p *PromStore) Ingest(r LogRecord) {
p.mu.Lock()
defer p.mu.Unlock()
// --- per-{host,method,status} request counter ---
hsk := hostSourceKey{Host: r.Website, SourceTag: r.SourceTag}
// nginx_http_requests_total{host,method,status} — capped.
ck := promCounterKey{Host: r.Website, Method: r.Method, Status: r.Status}
if _, ok := p.counters[ck]; ok {
p.counters[ck]++
@@ -85,37 +108,54 @@ func (p *PromStore) Ingest(r LogRecord) {
p.counters[ck] = 1
}
// --- body_bytes_sent histogram (keyed by host only) ---
observeBody(p.body, r.Website, r.BodyBytesSent)
// --- request_time histogram (keyed by host only) ---
te, ok := p.reqTime[r.Website]
if !ok {
te = &promTimeEntry{}
p.reqTime[r.Website] = te
}
for i, bound := range promTimeBounds {
if r.RequestTime <= bound {
te.buckets[i]++
}
}
te.buckets[promNumTimeBounds]++ // +Inf
te.sum += r.RequestTime
// --- per-source_tag rollups ---
p.sourceCounters[r.SourceTag]++
observeBody(p.sourceBody, r.SourceTag, r.BodyBytesSent)
p.mu.Unlock()
observeBody(p.bytesSent, hsk, r.BytesSent)
observeTime(p.requestDuration, hsk, r.RequestTime)
if r.RequestLength > 0 {
observeBody(p.requestBytes, hsk, r.RequestLength)
}
// IncUDPPacket, IncUDPSuccess, and IncUDPConsumed bump their respective
// UDP ingest counters. They are called from the UDP listener goroutine.
p.sourceCounters[sourceClassKey{
SourceTag: r.SourceTag,
StatusClass: statusClass(r.Status),
}]++
if r.HasUpstream {
observeTime(p.upstreamDuration, hsk, r.UpstreamResponseTime)
p.upstreamCounters[upstreamKey{
Host: r.Website,
SourceTag: r.SourceTag,
StatusClass: statusClass(r.UpstreamStatus),
}]++
}
}
// IncUDPPacket, IncUDPSuccess, IncUDPConsumed bump UDP-ingest counters from
// the listener goroutine.
func (p *PromStore) IncUDPPacket() { p.udpMu.Lock(); p.udpPacketsReceived++; p.udpMu.Unlock() }
func (p *PromStore) IncUDPSuccess() { p.udpMu.Lock(); p.udpLoglinesSuccess++; p.udpMu.Unlock() }
func (p *PromStore) IncUDPConsumed() { p.udpMu.Lock(); p.udpLoglinesConsumed++; p.udpMu.Unlock() }
func observeBody(m map[string]*promBodyEntry, key string, bytes int64) {
// statusClass folds an HTTP status code into 2xx/3xx/4xx/5xx, with anything
// else falling to "other" (including empty input).
func statusClass(status string) string {
if status == "" {
return "other"
}
switch status[0] {
case '2':
return "2xx"
case '3':
return "3xx"
case '4':
return "4xx"
case '5':
return "5xx"
default:
return "other"
}
}
func observeBody(m map[hostSourceKey]*promBodyEntry, key hostSourceKey, bytes int64) {
e, ok := m[key]
if !ok {
e = &promBodyEntry{}
@@ -126,53 +166,77 @@ func observeBody(m map[string]*promBodyEntry, key string, bytes int64) {
e.buckets[i]++
}
}
e.buckets[promNumBodyBounds]++ // +Inf
e.buckets[promNumBodyBounds]++
e.sum += bytes
}
func observeTime(m map[hostSourceKey]*promTimeEntry, key hostSourceKey, seconds float64) {
e, ok := m[key]
if !ok {
e = &promTimeEntry{}
m[key] = e
}
for i, bound := range promTimeBounds {
if seconds <= bound {
e.buckets[i]++
}
}
e.buckets[promNumTimeBounds]++
e.sum += seconds
}
// ServeHTTP renders all metrics in the Prometheus text exposition format (0.0.4).
func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
// Snapshot everything under the lock, then render without holding it.
p.mu.Lock()
type counterSnap struct {
k promCounterKey
v int64
}
type bodySnap struct {
k hostSourceKey
e promBodyEntry
}
type timeSnap struct {
k hostSourceKey
e promTimeEntry
}
type upstreamCounterSnap struct {
k upstreamKey
v int64
}
type sourceCounterSnap struct {
k sourceClassKey
v int64
}
p.mu.Lock()
counters := make([]counterSnap, 0, len(p.counters))
for k, v := range p.counters {
counters = append(counters, counterSnap{k, v})
}
type bodySnap struct {
label string
e promBodyEntry
bytesSnaps := make([]bodySnap, 0, len(p.bytesSent))
for k, e := range p.bytesSent {
bytesSnaps = append(bytesSnaps, bodySnap{k, *e})
}
bodySnaps := make([]bodySnap, 0, len(p.body))
for h, e := range p.body {
bodySnaps = append(bodySnaps, bodySnap{h, *e})
requestBytesSnaps := make([]bodySnap, 0, len(p.requestBytes))
for k, e := range p.requestBytes {
requestBytesSnaps = append(requestBytesSnaps, bodySnap{k, *e})
}
type timeSnap struct {
host string
e promTimeEntry
requestDurationSnaps := make([]timeSnap, 0, len(p.requestDuration))
for k, e := range p.requestDuration {
requestDurationSnaps = append(requestDurationSnaps, timeSnap{k, *e})
}
timeSnaps := make([]timeSnap, 0, len(p.reqTime))
for h, e := range p.reqTime {
timeSnaps = append(timeSnaps, timeSnap{h, *e})
upstreamDurationSnaps := make([]timeSnap, 0, len(p.upstreamDuration))
for k, e := range p.upstreamDuration {
upstreamDurationSnaps = append(upstreamDurationSnaps, timeSnap{k, *e})
}
type sourceCounterSnap struct {
tag string
v int64
upstreamCounters := make([]upstreamCounterSnap, 0, len(p.upstreamCounters))
for k, v := range p.upstreamCounters {
upstreamCounters = append(upstreamCounters, upstreamCounterSnap{k, v})
}
sourceCounters := make([]sourceCounterSnap, 0, len(p.sourceCounters))
for t, v := range p.sourceCounters {
sourceCounters = append(sourceCounters, sourceCounterSnap{t, v})
}
sourceBodySnaps := make([]bodySnap, 0, len(p.sourceBody))
for t, e := range p.sourceBody {
sourceBodySnaps = append(sourceBodySnaps, bodySnap{t, *e})
for k, v := range p.sourceCounters {
sourceCounters = append(sourceCounters, sourceCounterSnap{k, v})
}
p.mu.Unlock()
@@ -183,7 +247,6 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
udpConsumed := p.udpLoglinesConsumed
p.udpMu.Unlock()
// Sort for stable, human-readable output.
sort.Slice(counters, func(i, j int) bool {
a, b := counters[i].k, counters[j].k
if a.Host != b.Host {
@@ -194,85 +257,139 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) {
}
return a.Status < b.Status
})
sort.Slice(bodySnaps, func(i, j int) bool { return bodySnaps[i].label < bodySnaps[j].label })
sort.Slice(timeSnaps, func(i, j int) bool { return timeSnaps[i].host < timeSnaps[j].host })
sort.Slice(sourceCounters, func(i, j int) bool { return sourceCounters[i].tag < sourceCounters[j].tag })
sort.Slice(sourceBodySnaps, func(i, j int) bool { return sourceBodySnaps[i].label < sourceBodySnaps[j].label })
sortBody := func(s []bodySnap) {
sort.Slice(s, func(i, j int) bool {
a, b := s[i].k, s[j].k
if a.Host != b.Host {
return a.Host < b.Host
}
return a.SourceTag < b.SourceTag
})
}
sortTime := func(s []timeSnap) {
sort.Slice(s, func(i, j int) bool {
a, b := s[i].k, s[j].k
if a.Host != b.Host {
return a.Host < b.Host
}
return a.SourceTag < b.SourceTag
})
}
sortBody(bytesSnaps)
sortBody(requestBytesSnaps)
sortTime(requestDurationSnaps)
sortTime(upstreamDurationSnaps)
sort.Slice(upstreamCounters, func(i, j int) bool {
a, b := upstreamCounters[i].k, upstreamCounters[j].k
if a.Host != b.Host {
return a.Host < b.Host
}
if a.SourceTag != b.SourceTag {
return a.SourceTag < b.SourceTag
}
return a.StatusClass < b.StatusClass
})
sort.Slice(sourceCounters, func(i, j int) bool {
a, b := sourceCounters[i].k, sourceCounters[j].k
if a.SourceTag != b.SourceTag {
return a.SourceTag < b.SourceTag
}
return a.StatusClass < b.StatusClass
})
w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
bw := bufio.NewWriterSize(w, 256*1024)
// nginx_http_requests_total
fmt.Fprintln(bw, "# HELP nginx_http_requests_total Total number of HTTP requests processed.")
fmt.Fprintln(bw, "# TYPE nginx_http_requests_total counter")
// pf, pln are short helpers so the metric block reads cleanly. Errors on a
// bufio writer wrapping http.ResponseWriter mean the client disconnected;
// there's nothing useful to do mid-write — the next call will simply no-op.
pf := func(format string, a ...any) { _, _ = fmt.Fprintf(bw, format, a...) }
pln := func(s string) { _, _ = fmt.Fprintln(bw, s) }
pln("# HELP nginx_http_requests_total Total number of HTTP requests processed.")
pln("# TYPE nginx_http_requests_total counter")
for _, c := range counters {
fmt.Fprintf(bw, "nginx_http_requests_total{host=%q,method=%q,status=%q} %d\n",
pf("nginx_http_requests_total{host=%q,method=%q,status=%q} %d\n",
c.k.Host, c.k.Method, c.k.Status, c.v)
}
// nginx_http_response_body_bytes (histogram, labeled by host)
fmt.Fprintln(bw, "# HELP nginx_http_response_body_bytes HTTP response body size distribution in bytes.")
fmt.Fprintln(bw, "# TYPE nginx_http_response_body_bytes histogram")
for _, s := range bodySnaps {
writeBodyHistogram(bw, "nginx_http_response_body_bytes", "host", s.label, s.e)
pln("# HELP nginx_http_bytes_sent HTTP response size distribution in bytes (body for v1 records, full wire bytes for v2).")
pln("# TYPE nginx_http_bytes_sent histogram")
for _, s := range bytesSnaps {
writeBodyHistogramHS(bw, "nginx_http_bytes_sent", s.k, s.e)
}
// nginx_http_request_duration_seconds (histogram, labeled by host)
fmt.Fprintln(bw, "# HELP nginx_http_request_duration_seconds HTTP request processing time in seconds.")
fmt.Fprintln(bw, "# TYPE nginx_http_request_duration_seconds histogram")
for _, s := range timeSnaps {
for i, bound := range promTimeBounds {
fmt.Fprintf(bw, "nginx_http_request_duration_seconds_bucket{host=%q,le=%q} %d\n",
s.host, formatFloat(bound), s.e.buckets[i])
}
fmt.Fprintf(bw, "nginx_http_request_duration_seconds_bucket{host=%q,le=\"+Inf\"} %d\n",
s.host, s.e.buckets[promNumTimeBounds])
fmt.Fprintf(bw, "nginx_http_request_duration_seconds_count{host=%q} %d\n",
s.host, s.e.buckets[promNumTimeBounds])
fmt.Fprintf(bw, "nginx_http_request_duration_seconds_sum{host=%q} %g\n",
s.host, s.e.sum)
pln("# HELP nginx_http_request_bytes HTTP request size distribution in bytes (v2 emitters only).")
pln("# TYPE nginx_http_request_bytes histogram")
for _, s := range requestBytesSnaps {
writeBodyHistogramHS(bw, "nginx_http_request_bytes", s.k, s.e)
}
// nginx_http_requests_by_source_total (counter, labeled by source_tag)
fmt.Fprintln(bw, "# HELP nginx_http_requests_by_source_total HTTP requests rolled up by nginx source tag.")
fmt.Fprintln(bw, "# TYPE nginx_http_requests_by_source_total counter")
pln("# HELP nginx_http_request_duration_seconds HTTP request processing time in seconds.")
pln("# TYPE nginx_http_request_duration_seconds histogram")
for _, s := range requestDurationSnaps {
writeTimeHistogramHS(bw, "nginx_http_request_duration_seconds", s.k, s.e)
}
pln("# HELP nginx_http_upstream_duration_seconds Upstream response time in seconds (v2 emitters only).")
pln("# TYPE nginx_http_upstream_duration_seconds histogram")
for _, s := range upstreamDurationSnaps {
writeTimeHistogramHS(bw, "nginx_http_upstream_duration_seconds", s.k, s.e)
}
pln("# HELP nginx_http_upstream_requests_total Requests served via an upstream, by upstream-status class (v2 emitters only).")
pln("# TYPE nginx_http_upstream_requests_total counter")
for _, c := range upstreamCounters {
pf("nginx_http_upstream_requests_total{host=%q,source_tag=%q,status_class=%q} %d\n",
c.k.Host, c.k.SourceTag, c.k.StatusClass, c.v)
}
pln("# HELP nginx_http_requests_by_source_total HTTP requests rolled up by source_tag and status class.")
pln("# TYPE nginx_http_requests_by_source_total counter")
for _, c := range sourceCounters {
fmt.Fprintf(bw, "nginx_http_requests_by_source_total{source_tag=%q} %d\n", c.tag, c.v)
pf("nginx_http_requests_by_source_total{source_tag=%q,status_class=%q} %d\n",
c.k.SourceTag, c.k.StatusClass, c.v)
}
// nginx_http_response_body_bytes_by_source (histogram, labeled by source_tag)
fmt.Fprintln(bw, "# HELP nginx_http_response_body_bytes_by_source HTTP response body size distribution by nginx source tag.")
fmt.Fprintln(bw, "# TYPE nginx_http_response_body_bytes_by_source histogram")
for _, s := range sourceBodySnaps {
writeBodyHistogram(bw, "nginx_http_response_body_bytes_by_source", "source_tag", s.label, s.e)
pln("# HELP logtail_udp_packets_received_total Datagrams read from the UDP socket.")
pln("# TYPE logtail_udp_packets_received_total counter")
pf("logtail_udp_packets_received_total %d\n", udpPackets)
pln("# HELP logtail_udp_loglines_success_total UDP loglines that parsed successfully.")
pln("# TYPE logtail_udp_loglines_success_total counter")
pf("logtail_udp_loglines_success_total %d\n", udpSuccess)
pln("# HELP logtail_udp_loglines_consumed_total UDP loglines forwarded to the store (not dropped).")
pln("# TYPE logtail_udp_loglines_consumed_total counter")
pf("logtail_udp_loglines_consumed_total %d\n", udpConsumed)
_ = bw.Flush()
}
// UDP ingest counters — lets operators distinguish parse failures
// (received - success) from channel-full drops (success - consumed).
fmt.Fprintln(bw, "# HELP logtail_udp_packets_received_total Datagrams read from the UDP socket.")
fmt.Fprintln(bw, "# TYPE logtail_udp_packets_received_total counter")
fmt.Fprintf(bw, "logtail_udp_packets_received_total %d\n", udpPackets)
fmt.Fprintln(bw, "# HELP logtail_udp_loglines_success_total UDP loglines that parsed successfully.")
fmt.Fprintln(bw, "# TYPE logtail_udp_loglines_success_total counter")
fmt.Fprintf(bw, "logtail_udp_loglines_success_total %d\n", udpSuccess)
fmt.Fprintln(bw, "# HELP logtail_udp_loglines_consumed_total UDP loglines forwarded to the store (not dropped).")
fmt.Fprintln(bw, "# TYPE logtail_udp_loglines_consumed_total counter")
fmt.Fprintf(bw, "logtail_udp_loglines_consumed_total %d\n", udpConsumed)
bw.Flush()
}
func writeBodyHistogram(bw *bufio.Writer, metric, labelName, labelValue string, e promBodyEntry) {
func writeBodyHistogramHS(bw *bufio.Writer, metric string, k hostSourceKey, e promBodyEntry) {
pf := func(format string, a ...any) { _, _ = fmt.Fprintf(bw, format, a...) }
for i, bound := range promBodyBounds {
fmt.Fprintf(bw, "%s_bucket{%s=%q,le=%q} %d\n",
metric, labelName, labelValue, fmt.Sprintf("%d", bound), e.buckets[i])
pf("%s_bucket{host=%q,source_tag=%q,le=\"%d\"} %d\n",
metric, k.Host, k.SourceTag, bound, e.buckets[i])
}
fmt.Fprintf(bw, "%s_bucket{%s=%q,le=\"+Inf\"} %d\n",
metric, labelName, labelValue, e.buckets[promNumBodyBounds])
fmt.Fprintf(bw, "%s_count{%s=%q} %d\n",
metric, labelName, labelValue, e.buckets[promNumBodyBounds])
fmt.Fprintf(bw, "%s_sum{%s=%q} %d\n",
metric, labelName, labelValue, e.sum)
pf("%s_bucket{host=%q,source_tag=%q,le=\"+Inf\"} %d\n",
metric, k.Host, k.SourceTag, e.buckets[promNumBodyBounds])
pf("%s_count{host=%q,source_tag=%q} %d\n",
metric, k.Host, k.SourceTag, e.buckets[promNumBodyBounds])
pf("%s_sum{host=%q,source_tag=%q} %d\n",
metric, k.Host, k.SourceTag, e.sum)
}
func writeTimeHistogramHS(bw *bufio.Writer, metric string, k hostSourceKey, e promTimeEntry) {
pf := func(format string, a ...any) { _, _ = fmt.Fprintf(bw, format, a...) }
for i, bound := range promTimeBounds {
pf("%s_bucket{host=%q,source_tag=%q,le=%q} %d\n",
metric, k.Host, k.SourceTag, formatFloat(bound), e.buckets[i])
}
pf("%s_bucket{host=%q,source_tag=%q,le=\"+Inf\"} %d\n",
metric, k.Host, k.SourceTag, e.buckets[promNumTimeBounds])
pf("%s_count{host=%q,source_tag=%q} %d\n",
metric, k.Host, k.SourceTag, e.buckets[promNumTimeBounds])
pf("%s_sum{host=%q,source_tag=%q} %g\n",
metric, k.Host, k.SourceTag, e.sum)
}
// formatFloat renders a float64 bucket bound without trailing zeros but always
@@ -280,7 +397,7 @@ func writeBodyHistogram(bw *bufio.Writer, metric, labelName, labelValue string,
func formatFloat(f float64) string {
s := fmt.Sprintf("%g", f)
if !strings.Contains(s, ".") && !strings.Contains(s, "e") {
s += ".0" // ensure it looks like a float, not an integer
s += ".0"
}
return s
}
+122 -32
View File
@@ -6,22 +6,22 @@ import (
"testing"
)
func TestPromStoreIngestBodyBuckets(t *testing.T) {
func TestPromStoreIngestBytesBuckets(t *testing.T) {
ps := NewPromStore()
// 512 bytes: > 256, ≤ 1024 → bucket[0] stays 0, buckets[1..N] get 1
ps.Ingest(LogRecord{Website: "example.com", Method: "GET", Status: "200", BodyBytesSent: 512})
ps.Ingest(LogRecord{Website: "example.com", Method: "GET", Status: "200", BytesSent: 512, SourceTag: "direct"})
ps.mu.Lock()
be := ps.body["example.com"]
be := ps.bytesSent[hostSourceKey{"example.com", "direct"}]
ps.mu.Unlock()
if be == nil {
t.Fatal("expected body entry, got nil")
t.Fatal("expected bytes entry, got nil")
}
if be.buckets[0] != 0 { // le=256: 512 > 256
if be.buckets[0] != 0 {
t.Errorf("le=256 bucket = %d, want 0", be.buckets[0])
}
if be.buckets[1] != 1 { // le=1024: 512 ≤ 1024
if be.buckets[1] != 1 {
t.Errorf("le=1024 bucket = %d, want 1", be.buckets[1])
}
for i := 2; i <= promNumBodyBounds; i++ {
@@ -37,24 +37,21 @@ func TestPromStoreIngestBodyBuckets(t *testing.T) {
func TestPromStoreIngestTimeBuckets(t *testing.T) {
ps := NewPromStore()
// 0.075s: > 0.05, ≤ 0.1
ps.Ingest(LogRecord{Website: "example.com", Method: "GET", Status: "200", RequestTime: 0.075})
ps.Ingest(LogRecord{Website: "example.com", Method: "GET", Status: "200", RequestTime: 0.075, SourceTag: "direct"})
ps.mu.Lock()
te := ps.reqTime["example.com"]
te := ps.requestDuration[hostSourceKey{"example.com", "direct"}]
ps.mu.Unlock()
if te == nil {
t.Fatal("expected time entry, got nil")
t.Fatal("expected request_duration entry, got nil")
}
// le=0.05 (index 3): 0.075 > 0.05 → 0
if te.buckets[3] != 0 {
t.Errorf("le=0.05 bucket = %d, want 0", te.buckets[3])
}
// le=0.1 (index 4): 0.075 ≤ 0.1 → 1
if te.buckets[4] != 1 {
t.Errorf("le=0.1 bucket = %d, want 1", te.buckets[4])
}
// +Inf (last): always 1
if te.buckets[promNumTimeBounds] != 1 {
t.Errorf("+Inf bucket = %d, want 1", te.buckets[promNumTimeBounds])
}
@@ -83,7 +80,7 @@ func TestPromStoreServeHTTP(t *testing.T) {
ps := NewPromStore()
ps.Ingest(LogRecord{
Website: "example.com", Method: "GET", Status: "200",
BodyBytesSent: 100, RequestTime: 0.042,
BytesSent: 100, RequestTime: 0.042, SourceTag: "direct",
})
req := httptest.NewRequest("GET", "/metrics", nil)
@@ -95,13 +92,15 @@ func TestPromStoreServeHTTP(t *testing.T) {
checks := []string{
"# TYPE nginx_http_requests_total counter",
`nginx_http_requests_total{host="example.com",method="GET",status="200"} 1`,
"# TYPE nginx_http_response_body_bytes histogram",
`nginx_http_response_body_bytes_bucket{host="example.com",le="256"} 1`, // 100 ≤ 256
`nginx_http_response_body_bytes_count{host="example.com"} 1`,
`nginx_http_response_body_bytes_sum{host="example.com"} 100`,
"# TYPE nginx_http_bytes_sent histogram",
`nginx_http_bytes_sent_bucket{host="example.com",source_tag="direct",le="256"} 1`,
`nginx_http_bytes_sent_count{host="example.com",source_tag="direct"} 1`,
`nginx_http_bytes_sent_sum{host="example.com",source_tag="direct"} 100`,
"# TYPE nginx_http_request_duration_seconds histogram",
`nginx_http_request_duration_seconds_bucket{host="example.com",le="0.05"} 1`, // 0.042 ≤ 0.05
`nginx_http_request_duration_seconds_count{host="example.com"} 1`,
`nginx_http_request_duration_seconds_bucket{host="example.com",source_tag="direct",le="0.05"} 1`,
`nginx_http_request_duration_seconds_count{host="example.com",source_tag="direct"} 1`,
"# TYPE nginx_http_requests_by_source_total counter",
`nginx_http_requests_by_source_total{source_tag="direct",status_class="2xx"} 1`,
}
for _, want := range checks {
if !strings.Contains(body, want) {
@@ -110,12 +109,12 @@ func TestPromStoreServeHTTP(t *testing.T) {
}
}
func TestPromStoreSourceTagRollup(t *testing.T) {
func TestPromStoreSourceTagAndStatusClass(t *testing.T) {
ps := NewPromStore()
// same host, two tags; each tag should appear with its own series.
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BodyBytesSent: 100, SourceTag: "direct"})
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BodyBytesSent: 300, SourceTag: "cdn"})
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BodyBytesSent: 100, SourceTag: "cdn"})
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BytesSent: 100, SourceTag: "direct"})
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BytesSent: 300, SourceTag: "cdn"})
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "404", BytesSent: 100, SourceTag: "cdn"})
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "500", BytesSent: 50, SourceTag: "cdn"})
req := httptest.NewRequest("GET", "/metrics", nil)
rec := httptest.NewRecorder()
@@ -124,13 +123,17 @@ func TestPromStoreSourceTagRollup(t *testing.T) {
checks := []string{
"# TYPE nginx_http_requests_by_source_total counter",
`nginx_http_requests_by_source_total{source_tag="direct"} 1`,
`nginx_http_requests_by_source_total{source_tag="cdn"} 2`,
"# TYPE nginx_http_response_body_bytes_by_source histogram",
`nginx_http_response_body_bytes_by_source_sum{source_tag="direct"} 100`,
`nginx_http_response_body_bytes_by_source_sum{source_tag="cdn"} 400`,
// host-series totals are unchanged (one row, counting 3 requests).
`nginx_http_requests_total{host="h",method="GET",status="200"} 3`,
`nginx_http_requests_by_source_total{source_tag="direct",status_class="2xx"} 1`,
`nginx_http_requests_by_source_total{source_tag="cdn",status_class="2xx"} 1`,
`nginx_http_requests_by_source_total{source_tag="cdn",status_class="4xx"} 1`,
`nginx_http_requests_by_source_total{source_tag="cdn",status_class="5xx"} 1`,
// dual-labeled histogram subsumes the old by-source bytes metric
`nginx_http_bytes_sent_sum{host="h",source_tag="direct"} 100`,
`nginx_http_bytes_sent_sum{host="h",source_tag="cdn"} 450`,
// host counter unchanged
`nginx_http_requests_total{host="h",method="GET",status="200"} 2`,
`nginx_http_requests_total{host="h",method="GET",status="404"} 1`,
`nginx_http_requests_total{host="h",method="GET",status="500"} 1`,
}
for _, want := range checks {
if !strings.Contains(body, want) {
@@ -139,6 +142,79 @@ func TestPromStoreSourceTagRollup(t *testing.T) {
}
}
func TestPromStoreUpstreamMetrics(t *testing.T) {
ps := NewPromStore()
// One upstream-served 200, one upstream-served 502, one no-upstream 200.
ps.Ingest(LogRecord{
Website: "h", Method: "GET", Status: "200", BytesSent: 100, SourceTag: "cdn",
RequestLength: 500,
HasUpstream: true,
UpstreamResponseTime: 0.020,
UpstreamStatus: "200",
})
ps.Ingest(LogRecord{
Website: "h", Method: "GET", Status: "502", BytesSent: 100, SourceTag: "cdn",
RequestLength: 500,
HasUpstream: true,
UpstreamResponseTime: 0.150,
UpstreamStatus: "502",
})
ps.Ingest(LogRecord{
Website: "h", Method: "GET", Status: "200", BytesSent: 100, SourceTag: "direct",
RequestLength: 200,
})
req := httptest.NewRequest("GET", "/metrics", nil)
rec := httptest.NewRecorder()
ps.ServeHTTP(rec, req)
body := rec.Body.String()
checks := []string{
// upstream counter: only the two upstream-served requests
`nginx_http_upstream_requests_total{host="h",source_tag="cdn",status_class="2xx"} 1`,
`nginx_http_upstream_requests_total{host="h",source_tag="cdn",status_class="5xx"} 1`,
// upstream duration histogram only for upstream-served requests
`nginx_http_upstream_duration_seconds_count{host="h",source_tag="cdn"} 2`,
// request_bytes only observed when RequestLength > 0 — all three here
`nginx_http_request_bytes_count{host="h",source_tag="cdn"} 2`,
`nginx_http_request_bytes_count{host="h",source_tag="direct"} 1`,
}
for _, want := range checks {
if !strings.Contains(body, want) {
t.Errorf("missing %q in output:\n%s", want, body)
}
}
// no-upstream request must not bump upstream counters
if strings.Contains(body, `source_tag="direct",status_class=`) &&
strings.Contains(body, "nginx_http_upstream_requests_total") {
// Better check: ensure no direct-tagged upstream entry
for _, bad := range []string{
`nginx_http_upstream_requests_total{host="h",source_tag="direct"`,
`nginx_http_upstream_duration_seconds_bucket{host="h",source_tag="direct"`,
} {
if strings.Contains(body, bad) {
t.Errorf("unexpected %q in output:\n%s", bad, body)
}
}
}
}
func TestPromStoreRequestBytesSkippedWhenZero(t *testing.T) {
ps := NewPromStore()
// v1 record — RequestLength=0, so request_bytes histogram should be empty.
ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BytesSent: 100, SourceTag: "cdn"})
req := httptest.NewRequest("GET", "/metrics", nil)
rec := httptest.NewRecorder()
ps.ServeHTTP(rec, req)
body := rec.Body.String()
if strings.Contains(body, "nginx_http_request_bytes_bucket{") {
t.Errorf("expected no request_bytes series for v1 record:\n%s", body)
}
}
func TestPromStoreUDPCounters(t *testing.T) {
ps := NewPromStore()
ps.IncUDPPacket()
@@ -167,7 +243,6 @@ func TestPromStoreUDPCounters(t *testing.T) {
func TestPromStoreCounterCap(t *testing.T) {
ps := NewPromStore()
// Fill to cap with distinct {host,method,status} combos
for i := 0; i < promCounterCap+10; i++ {
host := strings.Repeat("x", i%10+1) + ".com"
status := "200"
@@ -183,3 +258,18 @@ func TestPromStoreCounterCap(t *testing.T) {
t.Errorf("counter map size %d exceeds cap %d", n, promCounterCap)
}
}
func TestStatusClass(t *testing.T) {
cases := map[string]string{
"200": "2xx", "201": "2xx", "299": "2xx",
"301": "3xx",
"404": "4xx", "418": "4xx",
"500": "5xx", "504": "5xx",
"100": "other", "": "other", "abc": "other",
}
for in, want := range cases {
if got := statusClass(in); got != want {
t.Errorf("statusClass(%q) = %q, want %q", in, got, want)
}
}
}
+13 -4
View File
@@ -32,9 +32,18 @@ func BenchmarkIngest(b *testing.B) {
}
}
// BenchmarkParseLine measures parser throughput.
// BenchmarkParseLine measures parser throughput on a v1 line.
func BenchmarkParseLine(b *testing.B) {
line := "www.example.com\t1.2.3.4\t1741954800.123\tGET\t/api/v1/search?q=foo\t200\t1452\t0.043"
line := "v1\twww.example.com\t1.2.3.4\tGET\t/api/v1/search?q=foo\t200\t1452\t0.043\t0\t12345\tcdn\t10.0.0.1\thttps"
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLine(line, 24, 48)
}
}
// BenchmarkParseLineV2 measures parser throughput on a v2 line with upstream.
func BenchmarkParseLineV2(b *testing.B) {
line := "v2\twww.example.com\t1.2.3.4\tGET\t/api/v1/search?q=foo\t200\t1500\t421\t0.043\t0.012\t200\t0\t12345\tcdn\t10.0.0.1\thttps"
b.ResetTimer()
for i := 0; i < b.N; i++ {
ParseLine(line, 24, 48)
@@ -118,7 +127,7 @@ func TestGRPCEndToEnd(t *testing.T) {
}
grpcSrv := grpc.NewServer()
pb.RegisterLogtailServiceServer(grpcSrv, NewServer(store, "e2e-test"))
go grpcSrv.Serve(lis)
go func() { _ = grpcSrv.Serve(lis) }()
defer grpcSrv.Stop()
// Dial it
@@ -127,7 +136,7 @@ func TestGRPCEndToEnd(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer conn.Close()
defer func() { _ = conn.Close() }()
client := pb.NewLogtailServiceClient(conn)
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+6 -6
View File
@@ -52,7 +52,7 @@ func (mt *MultiTailer) Run(ctx context.Context) {
if err != nil {
log.Fatalf("tailer: failed to create watcher: %v", err)
}
defer watcher.Close()
defer func() { _ = watcher.Close() }()
files := make(map[string]*fileState)
retrying := make(map[string]struct{}) // paths currently in a retryOpen goroutine
@@ -85,7 +85,7 @@ func (mt *MultiTailer) Run(ctx context.Context) {
select {
case <-ctx.Done():
for _, fs := range files {
fs.f.Close()
_ = fs.f.Close()
}
return
@@ -117,7 +117,7 @@ func (mt *MultiTailer) Run(ctx context.Context) {
if event.Has(fsnotify.Rename) || event.Has(fsnotify.Remove) {
// Drain remaining bytes in the old fd before it disappears.
mt.readLines(fs.reader)
fs.f.Close()
_ = fs.f.Close()
delete(files, event.Name)
_ = watcher.Remove(event.Name)
startRetry(event.Name)
@@ -167,7 +167,7 @@ func (mt *MultiTailer) rescan(
for path, fs := range files {
if _, matched := current[path]; !matched {
mt.readLines(fs.reader)
fs.f.Close()
_ = fs.f.Close()
_ = watcher.Remove(path)
delete(files, path)
log.Printf("tailer: retired %s (no longer matched by any pattern)", path)
@@ -182,11 +182,11 @@ func openAndSeekEOF(path string, watcher *fsnotify.Watcher) (*fileState, error)
return nil, err
}
if _, err := f.Seek(0, io.SeekEnd); err != nil {
f.Close()
_ = f.Close()
return nil, err
}
if err := watcher.Add(path); err != nil {
f.Close()
_ = f.Close()
return nil, err
}
return &fileState{f: f, reader: bufio.NewReader(f)}, nil
+8 -7
View File
@@ -11,7 +11,8 @@ import (
func writeLine(t *testing.T, f *os.File, website string) {
t.Helper()
_, err := fmt.Fprintf(f, "%s\t1.2.3.4\t0\tGET\t/path\t200\t0\t0.001\n", website)
// v1 layout: 12 payload fields after the v1 prefix.
_, err := fmt.Fprintf(f, "v1\t%s\t1.2.3.4\tGET\t/path\t200\t0\t0.001\t0\t0\tdirect\t10.0.0.1\thttps\n", website)
if err != nil {
t.Fatalf("writeLine: %v", err)
}
@@ -25,7 +26,7 @@ func TestMultiTailerReadsLines(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer f.Close()
defer func() { _ = f.Close() }()
ch := make(chan LogRecord, 100)
mt := NewMultiTailer([]string{path}, time.Hour, 24, 48, ch)
@@ -62,7 +63,7 @@ func TestMultiTailerMultipleFiles(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer f.Close()
defer func() { _ = f.Close() }()
files[i] = f
}
@@ -108,7 +109,7 @@ func TestMultiTailerLogRotation(t *testing.T) {
// Simulate logrotate: rename the old file, create a new one
rotated := filepath.Join(dir, "access.log.1")
f.Close()
_ = f.Close()
if err := os.Rename(path, rotated); err != nil {
t.Fatal(err)
}
@@ -121,7 +122,7 @@ func TestMultiTailerLogRotation(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer newF.Close()
defer func() { _ = newF.Close() }()
// Allow retry goroutine to pick it up
time.Sleep(300 * time.Millisecond)
@@ -137,7 +138,7 @@ func TestExpandGlobs(t *testing.T) {
dir := t.TempDir()
for _, name := range []string{"a.log", "b.log", "other.txt"} {
f, _ := os.Create(filepath.Join(dir, name))
f.Close()
_ = f.Close()
}
pattern := filepath.Join(dir, "*.log")
@@ -151,7 +152,7 @@ func TestExpandGlobsDeduplication(t *testing.T) {
dir := t.TempDir()
p := filepath.Join(dir, "access.log")
f, _ := os.Create(p)
f.Close()
_ = f.Close()
// Same file listed twice via explicit path and glob
paths := expandGlobs([]string{p, filepath.Join(dir, "*.log")})
+4 -4
View File
@@ -16,7 +16,7 @@ const udpReadBufBytes = 4 << 20
const udpPacketBuf = 64 << 10
// UDPListener receives nginx_ipng_stats_logtail datagrams on a local socket,
// parses each packet as one log line, and forwards LogRecords to ch.
// parses each line through the versioned ParseLine, and forwards LogRecords to ch.
type UDPListener struct {
addr string
v4bits int
@@ -50,7 +50,7 @@ func (u *UDPListener) Run(ctx context.Context) {
if err != nil {
log.Fatalf("udp: listen %s: %v", u.addr, err)
}
defer conn.Close()
defer func() { _ = conn.Close() }()
if err := conn.SetReadBuffer(udpReadBufBytes); err != nil {
log.Printf("udp: SetReadBuffer(%d): %v", udpReadBufBytes, err)
}
@@ -58,7 +58,7 @@ func (u *UDPListener) Run(ctx context.Context) {
go func() {
<-ctx.Done()
conn.Close()
_ = conn.Close()
}()
buf := make([]byte, udpPacketBuf)
@@ -84,7 +84,7 @@ func (u *UDPListener) Run(ctx context.Context) {
if line == "" {
continue
}
rec, ok := ParseUDPLine(line, u.v4bits, u.v6bits)
rec, ok := ParseLine(line, u.v4bits, u.v6bits)
if !ok {
continue
}
+8 -8
View File
@@ -17,7 +17,7 @@ func TestUDPListenerRoundTrip(t *testing.T) {
t.Fatalf("listen probe: %v", err)
}
addr := pc.LocalAddr().String()
pc.Close() // release; listener will re-bind
_ = pc.Close() // release; listener will re-bind
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@@ -31,15 +31,15 @@ func TestUDPListenerRoundTrip(t *testing.T) {
if err != nil {
t.Fatalf("dial: %v", err)
}
defer conn.Close()
defer func() { _ = conn.Close() }()
// The listener is started asynchronously; retry for up to 1s.
good := "v1\twww.example.com\t1.2.3.4\tGET\t/\t200\t42\t0.010\t0\t12345\tdirect\t10.0.0.1\thttps"
bad := "not enough\tfields"
deadline := time.Now().Add(time.Second)
for time.Now().Before(deadline) {
conn.Write([]byte(good))
conn.Write([]byte(bad))
_, _ = conn.Write([]byte(good))
_, _ = conn.Write([]byte(bad))
select {
case rec := <-ch:
if rec.Website != "www.example.com" || rec.SourceTag != "direct" {
@@ -80,7 +80,7 @@ func TestUDPListenerBatchedDatagram(t *testing.T) {
t.Fatalf("listen probe: %v", err)
}
addr := pc.LocalAddr().(*net.UDPAddr)
pc.Close()
_ = pc.Close()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@@ -100,7 +100,7 @@ func TestUDPListenerBatchedDatagram(t *testing.T) {
if err != nil {
t.Fatalf("src listen: %v", err)
}
defer src.Close()
defer func() { _ = src.Close() }()
// Drive the listener with retries until all three records land.
got := make(map[string]bool)
@@ -148,7 +148,7 @@ func TestUDPListenerMultipleSources(t *testing.T) {
t.Fatalf("listen probe: %v", err)
}
addr := pc.LocalAddr().(*net.UDPAddr)
pc.Close()
_ = pc.Close()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
@@ -166,7 +166,7 @@ func TestUDPListenerMultipleSources(t *testing.T) {
if err != nil {
t.Fatalf("%s listen: %v", tag, err)
}
defer src.Close()
defer func() { _ = src.Close() }()
deadline := time.Now().Add(time.Second)
for time.Now().Before(deadline) {
if _, err := src.WriteTo(good, addr); err != nil {
+1 -1
View File
@@ -51,7 +51,7 @@ func applyTerm(term string, fs *filterState) error {
// Find the first operator character: ~, !, >, <, =
opIdx := strings.IndexAny(term, "~!><=")
if opIdx <= 0 {
return fmt.Errorf("invalid term %q: expected field=value, field>=value, field~=regex, etc.", term)
return fmt.Errorf("invalid term %q: expected field=value, field>=value, field~=regex, etc", term)
}
field := strings.ToLower(strings.TrimSpace(term[:opIdx]))
+2 -2
View File
@@ -52,7 +52,7 @@ func startFake(t *testing.T, fs *fakeServer) string {
}
srv := grpc.NewServer()
pb.RegisterLogtailServiceServer(srv, fs)
go srv.Serve(lis)
go func() { _ = srv.Serve(lis) }()
t.Cleanup(srv.GracefulStop)
return lis.Addr().String()
}
@@ -541,7 +541,7 @@ func TestDialFake(t *testing.T) {
if err != nil {
t.Fatal(err)
}
defer conn.Close()
defer func() { _ = conn.Close() }()
_ = client
// If we got here without error, the fake server is reachable.
+3 -3
View File
@@ -542,7 +542,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
fmt.Sprintf("cannot connect to %s: %v", params.Target, err)))
return
}
defer conn.Close()
defer func() { _ = conn.Close() }()
ctx, cancel := context.WithTimeout(r.Context(), 5*time.Second)
defer cancel()
@@ -589,7 +589,7 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}
resp, err := ltClient.ListTargets(ctx, &pb.ListTargetsRequest{})
if ltConn != nil {
ltConn.Close()
_ = ltConn.Close()
}
if err != nil {
ltCh <- nil
@@ -683,5 +683,5 @@ func writeRawJSON(w http.ResponseWriter, params QueryParams, resp *pb.TopNRespon
o.Entries[i] = entry{Label: e.Label, Count: e.Count}
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(o)
_ = json.NewEncoder(w).Encode(o)
}
+1 -1
View File
@@ -59,7 +59,7 @@ func main() {
<-ctx.Done()
log.Printf("frontend: shutting down")
srv.Shutdown(context.Background())
_ = srv.Shutdown(context.Background())
}
func envOr(key, def string) string {
+105 -53
View File
@@ -64,8 +64,9 @@ natively, so operators can run it either from on-disk log files, from the UDP fe
### Non-Goals
- The system does **not** parse arbitrary nginx `log_format` strings. Two fixed tab-separated formats are supported: a file format and
a UDP format (see FR-2). Operators who need general parsing should use Vector, Fluent Bit, or Promtail.
- The system does **not** parse arbitrary nginx `log_format` strings. A single versioned tab-separated format is
supported on both file and UDP ingest (see FR-2). Operators who need general parsing should use Vector, Fluent Bit, or
Promtail.
- The system does **not** store raw log lines. Counts are aggregated at ingest; the original log lines are not kept in memory or on
disk. The project does not replace an access log.
- The system does **not** persist counters across restarts. Ring buffers are in-memory only. On aggregator restart, historical state
@@ -98,50 +99,92 @@ Each requirement carries a unique identifier (`FR-X.Y` or `NFR-X.Y`) so that lat
working set.
- **FR-1.5** `http_response` MUST be the HTTP status code as recorded by nginx.
- **FR-1.6** `is_tor` MUST be a boolean, populated by the operator in the log format (typically via a lookup against a TOR exit-node
list). For the file format, lines without this field default to `false` for backward compatibility.
- **FR-1.7** `asn` MUST be an int32 decimal value sourced from MaxMind GeoIP2 (or equivalent). For the file format, lines without
this field default to `0`.
- **FR-1.8** `ipng_source_tag` MUST be a short string identifying which attribution tag the request arrived under. For records from
on-disk log files, the collector MUST assign the tag `"direct"` (mirroring `nginx-ipng-stats-plugin`'s default-source convention). For
records from the UDP stream, the tag is taken from the log line as emitted by the plugin.
list). Operators without TOR data MUST emit literal `0`.
- **FR-1.7** `asn` MUST be an int32 decimal value sourced from MaxMind GeoIP2 (or equivalent). Operators without GeoIP data MUST
emit literal `0`.
- **FR-1.8** `ipng_source_tag` MUST be a short string identifying which attribution tag the request arrived under. The tag is
always taken verbatim from the log line; the collector does NOT synthesise a fallback. Operators not running
`nginx-ipng-stats-plugin` MUST emit a literal value (typically `"direct"`).
**FR-2 Log formats**
- **FR-2.1 File format.** The collector MUST accept nginx access logs in the following tab-separated layout, with the last two fields
(`is_tor`, `asn`) optional for backward compatibility:
- **FR-2.1 Versioned dispatch.** Both the file tailer and the UDP listener MUST funnel every input line through a single
parser that switches on a leading `v<N>\t` version tag. Lines without a recognised tag — including the legacy
positional file format — MUST be rejected and counted as parse failures. Two versions are defined: `v1` (FR-2.2) and
`v2` (FR-2.3). Both ingest paths accept both versions; downstream processing is identical regardless of which path the
line came in over. `$server_addr` and `$scheme` are parsed but discarded — they are reserved for future use.
- **FR-2.2 v1 format.** The v1 payload MUST be exactly 12 tab-separated fields after the `v1` tag (13 fields total).
```nginx
log_format logtail '$host\t$remote_addr\t$msec\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn';
log_format ipng_stats_logtail
'v1\t$host\t$remote_addr\t$request_method\t$request_uri\t$status\t'
'$body_bytes_sent\t$request_time\t$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme';
```
| # | Field | Ingested into |
|---|-------------------|----------------------------|
| 0 | `$host` | `website` |
| 1 | `$remote_addr` | `client_prefix` (truncated)|
| 2 | `$msec` | (discarded) |
|---|-------------------|-------------------------------------|
| 0 | `v1` | version tag |
| 1 | `$host` | `website` |
| 2 | `$remote_addr` | `client_prefix` (truncated) |
| 3 | `$request_method` | Prom `method` label |
| 4 | `$request_uri` | `http_request_uri` |
| 4 | `$request_uri` | `http_request_uri` (query stripped) |
| 5 | `$status` | `http_response` |
| 6 | `$body_bytes_sent`| Prom body histogram |
| 7 | `$request_time` | Prom duration histogram |
| 8 | `$is_tor` | `is_tor` (optional) |
| 9 | `$asn` | `asn` (optional) |
| 6 | `$body_bytes_sent`| Prom `nginx_http_bytes_sent` |
| 7 | `$request_time` | Prom `nginx_http_request_duration_seconds` |
| 8 | `$is_tor` | `is_tor` |
| 9 | `$asn` | `asn` |
| 10| `$ipng_source_tag`| `source_tag` |
| 11| `$server_addr` | *(parsed and discarded)* |
| 12| `$scheme` | *(parsed and discarded)* |
- **FR-2.2 UDP format.** The collector MUST accept datagrams in a versioned tab-separated layout, as emitted by
`nginx-ipng-stats-plugin`'s `ipng_stats_logtail` directive. Every datagram MUST begin with a literal version tag
(`v<N>\t`) so the collector can route each packet to the appropriate parser. Only `v1` is defined in this revision;
unknown versions MUST be counted as parse failures and dropped.
- **FR-2.3 v2 format.** The v2 payload MUST be exactly 15 tab-separated fields after the `v2` tag (16 fields total).
v2 replaces `$body_bytes_sent` with `$bytes_sent` (full wire bytes including headers) and adds four operationally
important fields: `$request_length` (request size including headers), `$upstream_response_time`, `$upstream_status`,
and the existing v1 fields rearranged for clarity.
```nginx
log_format ipng_stats_logtail 'v1\t$host\t$remote_addr\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme';
log_format ipng_stats_logtail
'v2\t$host\t$remote_addr\t$request_method\t$request_uri\t$status\t'
'$bytes_sent\t$request_length\t$request_time\t$upstream_response_time\t$upstream_status\t'
'$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme';
```
The v1 payload MUST have exactly 12 tab-separated fields after the `v1` tag (13 fields total). `$server_addr` and
`$scheme` MUST be parsed but dropped; they are reserved for future use. Malformed datagrams (wrong version, wrong
field count, bad IP) MUST be counted (FR-8.5) and silently dropped.
| # | Field | Ingested into |
|---|---------------------------|----------------------------------------------|
| 0 | `v2` | version tag |
| 1 | `$host` | `website` |
| 2 | `$remote_addr` | `client_prefix` (truncated) |
| 3 | `$request_method` | Prom `method` label |
| 4 | `$request_uri` | `http_request_uri` (query stripped) |
| 5 | `$status` | `http_response` |
| 6 | `$bytes_sent` | Prom `nginx_http_bytes_sent` |
| 7 | `$request_length` | Prom `nginx_http_request_bytes` (v2-only) |
| 8 | `$request_time` | Prom `nginx_http_request_duration_seconds` |
| 9 | `$upstream_response_time` | Prom `nginx_http_upstream_duration_seconds` (v2-only) |
| 10| `$upstream_status` | Prom `nginx_http_upstream_requests_total` (v2-only) |
| 11| `$is_tor` | `is_tor` |
| 12| `$asn` | `asn` |
| 13| `$ipng_source_tag` | `source_tag` |
| 14| `$server_addr` | *(parsed and discarded)* |
| 15| `$scheme` | *(parsed and discarded)* |
- **FR-2.3** The file tailer MUST set `source_tag="direct"` on every record it parses. The UDP listener MUST propagate
`$ipng_source_tag` verbatim. This is the only difference in downstream processing between the two ingest paths.
When nginx serves the response without an upstream (static files, redirects, errors), nginx emits literal `-` for
`$upstream_response_time` and `$upstream_status`. The parser MUST treat that as "no upstream", skip the upstream
histograms, and not increment the upstream counter. When nginx retries across multiple upstreams, both fields are
comma-separated; the parser MUST keep the last entry, since that is the upstream that ultimately served the response.
- **FR-2.4 Semantic shift on v2 rollout.** v1 fills `nginx_http_bytes_sent` from `$body_bytes_sent`; v2 fills it from
`$bytes_sent`. Operators MUST expect a small step up in the metric when emitters move from v1 to v2 (header overhead;
typically a few hundred bytes per response).
- **FR-2.5 Malformed input.** Lines with an unknown version, the wrong field count for the claimed version, or an
unparsable IP MUST be silently dropped. UDP drops MUST be counted via FR-8.6; file-path drops are implicit (the tailer
falls behind the file).
- **FR-2.6 Unknown `$is_tor` / `$asn`.** Operators without TOR or GeoIP data MUST emit literal `0` for both fields. A
literal `0` in `$is_tor` parses as `false`; a literal `0` in `$asn` parses as ASN `0`, filterable at query time with
`--asn '!=0'`.
**FR-3 Ring buffers and time windows**
@@ -242,15 +285,21 @@ Each requirement carries a unique identifier (`FR-X.Y` or `NFR-X.Y`) so that lat
- **FR-8.2** The collector MUST expose a per-request counter `nginx_http_requests_total{host, method, status}` capped at
`promCounterCap = 250 000` distinct label sets. When the cap is reached, further new label sets MUST be dropped (existing series
keep incrementing) until the map is rolled over.
- **FR-8.3** The collector MUST expose per-host histograms
`nginx_http_response_body_bytes{host, le}` (body-size distribution) and
`nginx_http_request_duration_seconds{host, le}` (request-time distribution). The duration histogram MUST NOT be split by
`source_tag` — its bucket count would multiply without operational benefit.
- **FR-8.4** The collector MUST expose two parallel roll-ups labeled by `source_tag` only (not cross-producted with host):
`nginx_http_requests_by_source_total{source_tag}` and
`nginx_http_response_body_bytes_by_source{source_tag, le}`. These are separate metric names to avoid inconsistent label sets
under a single name.
- **FR-8.5** The collector MUST expose three counters that let operators distinguish UDP parse failures from back-pressure drops:
- **FR-8.3** The collector MUST expose two histograms keyed by `{host, source_tag}`:
`nginx_http_bytes_sent{host, source_tag, le}` (response wire-bytes distribution; FR-2.4) and
`nginx_http_request_duration_seconds{host, source_tag, le}` (end-to-end request time distribution).
Cardinality is bounded by `host × source_tag × bucket_count`, which is small enough that no explicit cap is required.
- **FR-8.4** The collector MUST expose three v2-only metrics that are populated only when v2 records arrive (and, for the
upstream metrics, only when nginx involved an upstream):
`nginx_http_request_bytes{host, source_tag, le}` from `$request_length`,
`nginx_http_upstream_duration_seconds{host, source_tag, le}` from `$upstream_response_time`, and
`nginx_http_upstream_requests_total{host, source_tag, status_class}` from `$upstream_status`. `status_class` is the
HTTP class of the upstream's status code, folded to `2xx`/`3xx`/`4xx`/`5xx`/`other`.
- **FR-8.5** The collector MUST expose a source-tag rollup counter
`nginx_http_requests_by_source_total{source_tag, status_class}`. `status_class` is the HTTP class of `$status`, folded
the same way as in FR-8.4. This rollup is intentionally not cross-producted with `host` — its purpose is fleet-wide
source-attribution health, not per-host detail.
- **FR-8.6** The collector MUST expose three counters that let operators distinguish UDP parse failures from back-pressure drops:
`logtail_udp_packets_received_total` (datagrams off the socket, one increment per `recvfrom`),
`logtail_udp_loglines_success_total` (log lines that parsed OK, incremented once per log line — a single batched datagram from
the nginx plugin may contribute many), and
@@ -279,13 +328,13 @@ Each requirement carries a unique identifier (`FR-X.Y` or `NFR-X.Y`) so that lat
for a collector is therefore approximately 845 MB (live map ~19 MB + fine ring ~558 MB + coarse ring ~268 MB).
- **NFR-2.3** The aggregator MUST apply the same tier caps as the collector. Its steady-state memory is roughly equivalent to one
collector regardless of the number of collectors subscribed.
- **NFR-2.4** The Prometheus counter map (FR-8.2) MUST be capped at `promCounterCap = 250 000` entries. The per-host and per-source
histograms MUST NOT be capped explicitly — they grow only with the distinct host count, which is bounded by the operator's vhost
configuration.
- **NFR-2.4** The Prometheus counter map (FR-8.2) MUST be capped at `promCounterCap = 250 000` entries. The dual-labeled
`{host, source_tag}` histograms MUST NOT be capped explicitly — they grow only with the cross-product of distinct
hosts and distinct source tags, both bounded by the operator's nginx configuration.
**NFR-3 Performance**
- **NFR-3.1** `ParseLine` and `ParseUDPLine` MUST use `strings.Split` / `strings.SplitN` (no regex), so that per-line cost stays
- **NFR-3.1** `ParseLine` MUST use `strings.Split` / `strings.IndexByte` (no regex), so that per-line cost stays
around 50 ns on commodity hardware.
- **NFR-3.2** `TopN` and `Trend` queries across the full 24-hour coarse ring MUST complete in well under 250 ms at the 50 000-entry
fine cap, for fully-specified filters.
@@ -417,8 +466,10 @@ connected.
#### Key data types
- `LogRecord`ten fields (website, client_prefix, URI, status, is_tor, asn, method, body_bytes_sent, request_time, source_tag).
Produced by `ParseLine` or `ParseUDPLine` and consumed by the store goroutine.
- `LogRecord`fourteen fields (website, client_prefix, URI, status, is_tor, asn, method, bytes_sent, request_length,
request_time, upstream_response_time, upstream_status, has_upstream, source_tag). Produced by `ParseLine` (which
dispatches on the `v<N>\t` prefix) and consumed by the store goroutine. v1 records leave the v2-only fields
(`request_length`, upstream_*) at zero / false.
- `Tuple6` (historical name; carries seven fields now) — the aggregation key. NUL-separated when encoded as a map key for snapshots.
The code name is intentionally stable so downstream tests and consumers are not churned.
- `Snapshot``(timestamp, []Entry)` where `Entry = (label, count)` and `label` is an encoded `Tuple6`.
@@ -559,9 +610,10 @@ transitions. No per-request logging.
- **Frontend crash.** Stateless. Operator restarts.
- **UDP datagram loss.** Any datagram dropped in-kernel (socket buffer full, network drop) does not register as a parse failure; it
is simply invisible. Operators should size `SO_RCVBUF` appropriately; the collector already requests 4 MiB.
- **Malformed log lines.** File format: lines with <8 tab-separated fields are silently skipped; an invalid IP also drops the line.
UDP: packets without a recognised `v<N>\t` prefix, or with the wrong field count for the claimed version, or with a bad IP, are
counted as received-but-not-success and dropped.
- **Malformed log lines.** Both ingest paths use the versioned `v<N>\t` parser (FR-2). Lines without a recognised version
prefix, with the wrong field count for the claimed version, or with a bad IP are silently dropped. UDP drops are
visible as `packets_received_total - loglines_success_total`; file-path drops are implicit (the tailer simply moves
past them).
- **Clock skew between collectors.** Trend sparklines derived from merged data assume collectors are roughly NTP-synced. Per-bucket
alignment is to the local minute / 5-minute boundary of each collector.
- **gRPC traffic over untrusted links.** The system does not ship TLS; operators should front the gRPC ports with a TLS-terminating
@@ -588,11 +640,11 @@ transitions. No per-request logging.
- **pull-based collector polling (aggregator polls collectors every second).** Rejected in favor of push. Polling multiplies query
latency and makes the aggregator's cache stale by the poll interval. Push-stream with delta merge keeps the cache within seconds
of real time.
- **One metric name for both per-host and per-source_tag roll-ups.** Rejected for Prometheus hygiene. Mixing different label sets
under one metric name breaks aggregation rules; separate metric names (`_by_source`) are clearer and easier to query.
- **Cross-product of `host × source_tag` for every counter and histogram.** Rejected. With ~20 tags and ~50 hosts the cardinality
explodes quickly on the duration histogram without operational benefit. The duration histogram stays per-host; requests and body
size get a parallel `_by_source` rollup.
- **Separate `_by_source` metric names with a single label.** The original v0.2 layout exposed `_by_source` siblings to
avoid mixing label sets under one metric name. Superseded by the v0.3 layout: histograms now carry both `host` and
`source_tag` directly, and the source-tag rollup counter gains a `status_class` label. Cardinality stays bounded
(~7 hosts × ~6 tags × 11 buckets ≈ 460 series per histogram), and Grafana queries become simpler (`sum by(source_tag)`
rather than picking a different metric name).
- **Writing every `snapshot` to disk for restart recovery.** Rejected in favor of `DumpSnapshots` RPC backfill. Disk-backed
persistence would multiply operational surface (rotation, fsck, permissions) for a feature that needs to survive only an
aggregator restart.
+109 -87
View File
@@ -131,101 +131,98 @@ or for temporary overrides, without editing the unit.
The file is **not a dpkg conffile**: postinst writes it only when absent, so operator edits
survive upgrades, and `dpkg --purge` removes it.
### nginx — file-based ingest
### nginx — log format
Add the `logtail` format and attach it to whichever `server` blocks you want tracked:
Both ingest paths (file and UDP) use the same versioned tab-separated format. Every line MUST
begin with a literal `v1\t` or `v2\t` prefix; lines without a recognised prefix are dropped.
Two versions are defined; you can mix them across a fleet during a rollout (the collector
parses both).
```nginx
http {
log_format logtail '$host\t$remote_addr\t$msec\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn';
#### v2 (recommended)
server {
access_log /var/log/nginx/access.log logtail;
# or per-vhost:
access_log /var/log/nginx/www.example.com.access.log logtail;
}
}
```
Tab-separated, fixed field order, ten fields. The precise layout:
| # | Field | Ingested into |
|---|-------------------|--------------------------|
| 0 | `$host` | `website` |
| 1 | `$remote_addr` | `client_prefix` (truncated) |
| 2 | `$msec` | *(discarded)* |
| 3 | `$request_method` | Prom `method` label |
| 4 | `$request_uri` | `http_request_uri` (query stripped) |
| 5 | `$status` | `http_response` |
| 6 | `$body_bytes_sent`| Prom body histogram |
| 7 | `$request_time` | Prom duration histogram |
| 8 | `$is_tor` | `is_tor` (optional) |
| 9 | `$asn` | `asn` (optional) |
`$is_tor` is `1` if the client IP is a TOR exit node and `0` otherwise (typically populated
via a Lua script or `$geoip2_data_*`). `$asn` is the client AS number as a decimal integer
(e.g. MaxMind GeoIP2's `$geoip2_data_autonomous_system_number`).
**If either is unknown, emit `0`.** A literal `0` in `$is_tor` parses as `false`; a literal
`0` in `$asn` parses as ASN `0`, which you can exclude at query time with `--asn '!=0'` / the
`asn!=0` filter expression. Operators who don't have TOR or GeoIP data can simply emit `0` for
both columns and everything works.
Both fields are also **positionally optional** for backward compatibility — older 8-field
lines are accepted and default to `false` / `0`. Records from the file tailer are always
tagged `source_tag="direct"`.
Then point the collector at the log files via `COLLECTOR_LOGS` — comma-separated paths or
glob patterns. Make sure the files are group-readable by `www-data` (the collector's primary
group in the systemd unit).
### nginx — UDP ingest (`nginx-ipng-stats-plugin`)
If the nginx host runs [`nginx-ipng-stats-plugin`](https://git.ipng.ch/ipng/nginx-ipng-stats-plugin),
the plugin's `ipng_stats_logtail` directive emits one UDP datagram per request directly to
the collector, no log file involved. The wire format is **versioned** — every datagram starts
with a literal `v1\t` prefix so the collector can ship new parser versions (v2, v3, …) before
emitters are upgraded and route each packet accordingly.
v2 carries five operationally important fields v1 lacks: `$bytes_sent` (full wire bytes,
replaces `$body_bytes_sent`), `$request_length` (request size including headers),
`$upstream_response_time`, and `$upstream_status`. Together they let dashboards split
end-to-end latency into upstream vs. nginx overhead, attribute errors to the upstream vs. the
edge, and report ingress bandwidth.
```nginx
http {
log_format ipng_stats_logtail
'v1\t$host\t$remote_addr\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme';
'v2\t$host\t$remote_addr\t$request_method\t$request_uri\t$status\t'
'$bytes_sent\t$request_length\t$request_time\t$upstream_response_time\t$upstream_status\t'
'$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme';
# File ingest:
server {
access_log /var/log/nginx/access.log ipng_stats_logtail;
}
# UDP ingest (nginx-ipng-stats-plugin):
ipng_stats_logtail ipng_stats_logtail udp://127.0.0.1:9514 buffer=64k flush=1s;
}
```
Precise v1 layout — 13 tab-separated fields total (version prefix + 12 payload fields):
| # | Field | Ingested into |
|---|-------------------|------------------------------|
| 0 | `v1` | version tag |
|---|---------------------------|---------------------------------------------------|
| 0 | `v2` | version tag |
| 1 | `$host` | `website` |
| 2 | `$remote_addr` | `client_prefix` (truncated) |
| 3 | `$request_method` | Prom `method` label |
| 4 | `$request_uri` | `http_request_uri` (query stripped) |
| 5 | `$status` | `http_response` |
| 6 | `$body_bytes_sent`| Prom body histogram |
| 7 | `$request_time` | Prom duration histogram |
| 8 | `$is_tor` | `is_tor` |
| 9 | `$asn` | `asn` |
| 10| `$ipng_source_tag`| `source_tag` |
| 11| `$server_addr` | *(parsed and discarded)* |
| 12| `$scheme` | *(parsed and discarded)* |
| 6 | `$bytes_sent` | Prom `nginx_http_bytes_sent` |
| 7 | `$request_length` | Prom `nginx_http_request_bytes` |
| 8 | `$request_time` | Prom `nginx_http_request_duration_seconds` |
| 9 | `$upstream_response_time` | Prom `nginx_http_upstream_duration_seconds` |
| 10| `$upstream_status` | Prom `nginx_http_upstream_requests_total` |
| 11| `$is_tor` | `is_tor` |
| 12| `$asn` | `asn` |
| 13| `$ipng_source_tag` | `source_tag` |
| 14| `$server_addr` | *(parsed and discarded)* |
| 15| `$scheme` | *(parsed and discarded)* |
Compared to the file format: the version tag is added, `$msec` is dropped, and three fields
are appended — `$ipng_source_tag` (propagated into the data model), `$server_addr` and
`$scheme` (reserved for future use).
For requests served without an upstream (static files, redirects, errors), nginx emits
literal `-` for `$upstream_response_time` and `$upstream_status`; the parser treats those as
"no upstream" and skips the upstream metrics rather than counting them as zeros. When nginx
retries across multiple upstreams, both fields are comma-separated and the parser keeps the
last value (the upstream that ultimately served the response).
**Unknown `$is_tor` / `$asn`: emit `0`.** Same convention as the file format — operators
without TOR or GeoIP data can emit `0` for both columns and everything works. A literal `0`
in `$is_tor` is `false`; a literal `0` in `$asn` is ASN `0`, filterable at query time.
#### v1 (legacy)
All 13 fields are required for v1 — malformed packets (wrong version, wrong field count, bad
IP) are silently dropped and counted via `logtail_udp_packets_received_total` minus
`logtail_udp_loglines_success_total`. Both paths (file + UDP) can feed the same collector
simultaneously; they converge on the same aggregation pipeline.
v1 is preserved unchanged so existing emitters can be upgraded after the collector. Layout:
```nginx
log_format ipng_stats_logtail
'v1\t$host\t$remote_addr\t$request_method\t$request_uri\t$status\t'
'$body_bytes_sent\t$request_time\t$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme';
```
12 tab-separated payload fields after the `v1` prefix. v1 fills `nginx_http_bytes_sent` from
`$body_bytes_sent`; v2 fills it from `$bytes_sent`. Operators will see a small step up in
that metric (header overhead, typically a few hundred bytes per response) when emitters move
to v2.
#### Required values
`$is_tor` is `1` if the client IP is a TOR exit node and `0` otherwise (typically populated
via a Lua script or `$geoip2_data_*`). `$asn` is the client AS number as a decimal integer
(e.g. MaxMind GeoIP2's `$geoip2_data_autonomous_system_number`). Operators without TOR or
GeoIP data MUST emit literal `0` for both — a literal `0` in `$is_tor` parses as `false`; a
literal `0` in `$asn` is ASN `0`, filterable at query time with `--asn '!=0'`.
`$ipng_source_tag` is provided by [`nginx-ipng-stats-plugin`](https://git.ipng.ch/ipng/nginx-ipng-stats-plugin).
Operators not running the plugin SHOULD declare a constant via `set $ipng_source_tag direct;`
in their `server` block — there is no synthesised fallback in the collector.
#### Pointing the collector at logs
For file ingest, set `COLLECTOR_LOGS` to comma-separated paths or glob patterns. Make sure
the files are group-readable by `www-data` (the collector's primary group in the systemd
unit). For UDP ingest, the plugin's `ipng_stats_logtail udp://127.0.0.1:9514` line above is
sufficient. Both paths can feed the same collector simultaneously and converge on the same
aggregation pipeline. Malformed lines (wrong version, wrong field count, bad IP) are silently
dropped; for UDP they show up as `logtail_udp_packets_received_total` minus
`logtail_udp_loglines_success_total`.
---
@@ -303,21 +300,33 @@ the new file appears. No restart or SIGHUP required.
The collector exposes a Prometheus-compatible `/metrics` endpoint on `--prom-listen` (default
`:9100`). Set `--prom-listen ""` to disable it entirely.
**Per-host series:**
**Per-{host,source_tag} series** (both v1 and v2):
- `nginx_http_requests_total{host, method, status}` — counter. Map capped at 250 000 distinct
label sets; new entries beyond the cap are dropped until the map is rolled over.
- `nginx_http_response_body_bytes_{bucket,count,sum}{host, le}` — histogram of
`$body_bytes_sent`. Buckets (bytes): `256, 1024, 4096, 16384, 65536, 262144, 1048576, +Inf`.
- `nginx_http_request_duration_seconds_{bucket,count,sum}{host, le}` — histogram of
`$request_time`. Buckets (seconds): `0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5,
10, +Inf`. Not split by `source_tag` (duration histogram stays per-host to avoid cardinality
blow-up).
- `nginx_http_bytes_sent_{bucket,count,sum}{host, source_tag, le}` — histogram of response
size. v1 fills from `$body_bytes_sent`; v2 fills from `$bytes_sent`. Buckets (bytes):
`256, 1024, 4096, 16384, 65536, 262144, 1048576, +Inf`.
- `nginx_http_request_duration_seconds_{bucket,count,sum}{host, source_tag, le}` — histogram
of `$request_time`. Buckets (seconds): `0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5,
5, 10, +Inf`.
**Per-`source_tag` roll-ups** (parallel series, not a cross-product with `host`):
**v2-only series** (populated only when v2 emitters are running, and the upstream histograms
only when nginx involved an upstream):
- `nginx_http_requests_by_source_total{source_tag}`counter.
- `nginx_http_response_body_bytes_by_source_{bucket,count,sum}{source_tag, le}` — histogram.
- `nginx_http_request_bytes_{bucket,count,sum}{host, source_tag, le}`histogram of
`$request_length` (ingress, headers + body). Same byte buckets as `bytes_sent`.
- `nginx_http_upstream_duration_seconds_{bucket,count,sum}{host, source_tag, le}`
histogram of `$upstream_response_time`. Same time buckets as `request_duration`. Lets you
split end-to-end latency into upstream vs. nginx overhead.
- `nginx_http_upstream_requests_total{host, source_tag, status_class}` — counter incremented
once per upstream-served request, classed by `$upstream_status` (`2xx`/`3xx`/`4xx`/`5xx`/`other`).
Lets you spot upstream errors masked at the edge (e.g. nginx 502 because origin 504).
**Source-tag rollup** (fleet-wide attribution health, intentionally not crossed with host):
- `nginx_http_requests_by_source_total{source_tag, status_class}` — counter classed by
`$status`. Use it to spot per-source error spikes without exploding cardinality.
**UDP ingest counters** — lets operators distinguish parse failures from back-pressure drops:
@@ -365,10 +374,23 @@ histogram_quantile(0.95,
sum by (host, le) (rate(nginx_http_request_duration_seconds_bucket[5m]))
)
# Median response body size per host
histogram_quantile(0.50,
sum by (host, le) (rate(nginx_http_response_body_bytes_bucket[5m]))
# 95th percentile response time per source_tag (drill in further as needed)
histogram_quantile(0.95,
sum by (source_tag, le) (rate(nginx_http_request_duration_seconds_bucket[5m]))
)
# Median response size per host
histogram_quantile(0.50,
sum by (host, le) (rate(nginx_http_bytes_sent_bucket[5m]))
)
# v2-only: upstream P95, split out from nginx overhead
histogram_quantile(0.95,
sum by (host, le) (rate(nginx_http_upstream_duration_seconds_bucket[5m]))
)
# v2-only: upstream 5xx rate per source_tag
sum by (source_tag) (rate(nginx_http_upstream_requests_total{status_class="5xx"}[5m]))
```
### Memory usage