From 577ed3dad55dab47048d7e0a27058c3159e49054 Mon Sep 17 00:00:00 2001 From: Pim van Pelt Date: Fri, 17 Apr 2026 09:50:54 +0200 Subject: [PATCH] Refactor docs; Add 'ipng_source_tag', add udp listener for nginx-ipng-stats plugin --- README.md | 6 +- cmd/cli/flags.go | 13 +- cmd/cli/main.go | 5 +- cmd/collector/main.go | 22 +- cmd/collector/parser.go | 89 ++-- cmd/collector/parser_test.go | 95 ++++- cmd/collector/prom.go | 135 ++++-- cmd/collector/prom_test.go | 55 +++ cmd/collector/store.go | 2 +- cmd/collector/udp.go | 86 ++++ cmd/collector/udp_test.go | 67 +++ cmd/frontend/filter.go | 10 +- cmd/frontend/frontend_test.go | 11 +- cmd/frontend/handler.go | 28 +- docs/DETAILS.md | 528 ----------------------- docs/PLAN_AGGREGATOR.md | 250 ----------- docs/PLAN_CLI.md | 293 ------------- docs/PLAN_COLLECTOR.md | 144 ------- docs/PLAN_FRONTEND.md | 334 --------------- docs/design.md | 608 +++++++++++++++++++++++++++ docs/{USERGUIDE.md => user-guide.md} | 0 internal/store/store.go | 31 +- internal/store/store_test.go | 42 ++ proto/logtail.proto | 2 + proto/logtailpb/logtail.pb.go | 177 ++++---- proto/logtailpb/logtail_grpc.pb.go | 4 +- 26 files changed, 1319 insertions(+), 1718 deletions(-) create mode 100644 cmd/collector/udp.go create mode 100644 cmd/collector/udp_test.go delete mode 100644 docs/DETAILS.md delete mode 100644 docs/PLAN_AGGREGATOR.md delete mode 100644 docs/PLAN_CLI.md delete mode 100644 docs/PLAN_COLLECTOR.md delete mode 100644 docs/PLAN_FRONTEND.md create mode 100644 docs/design.md rename docs/{USERGUIDE.md => user-guide.md} (100%) diff --git a/README.md b/README.md index a1f2ed6..5794daf 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,5 @@ It's written in Go, and is meant to deploy collectors on any number of webserver aggregation and frontend logic. It's released under [[APACHE](LICENSE)] license. It can be run either as `systemd` units, or in Docker, or any combination of the two. -See [[User Guide](docs/USERGUIDE.md)] or [[DETAILS](docs/DETAILS.md)] for more information. - -The [[docs/](docs/)] directory contains extensive planning information which shows how Claude -Code single-shot implemented the whole system in March 2026. +See [[User Guide](docs/user-guide.md)] for operator-facing documentation, or +[[Design](docs/design.md)] for the normative requirements and architectural rationale. diff --git a/cmd/cli/flags.go b/cmd/cli/flags.go index 9518d84..5facc48 100644 --- a/cmd/cli/flags.go +++ b/cmd/cli/flags.go @@ -24,6 +24,7 @@ type sharedFlags struct { uriReNeg string // RE2 regex exclusion against request URI isTor string // "", "1" / "!=0" (TOR only), "0" / "!=1" (non-TOR only) asn string // expression: "12345", "!=65000", ">=1000", etc. + sourceTag string // exact ipng_source_tag match } // bindShared registers the shared flags on fs and returns a pointer to the @@ -42,6 +43,7 @@ func bindShared(fs *flag.FlagSet) (*sharedFlags, *string) { fs.StringVar(&sf.uriReNeg, "uri-re-neg", "", "filter: RE2 regex exclusion against request URI") fs.StringVar(&sf.isTor, "is-tor", "", "filter: TOR traffic (1 or !=0 = TOR only; 0 or !=1 = non-TOR only)") fs.StringVar(&sf.asn, "asn", "", "filter: ASN expression (12345, !=65000, >=1000, <64512, …)") + fs.StringVar(&sf.sourceTag, "source-tag", "", "filter: exact ipng_source_tag match (e.g. 
direct, cdn, …)") return sf, target } @@ -64,7 +66,7 @@ func parseTargets(s string) []string { } func buildFilter(sf *sharedFlags) *pb.Filter { - if sf.website == "" && sf.prefix == "" && sf.uri == "" && sf.status == "" && sf.websiteRe == "" && sf.uriRe == "" && sf.websiteReNeg == "" && sf.uriReNeg == "" && sf.isTor == "" && sf.asn == "" { + if sf.website == "" && sf.prefix == "" && sf.uri == "" && sf.status == "" && sf.websiteRe == "" && sf.uriRe == "" && sf.websiteReNeg == "" && sf.uriReNeg == "" && sf.isTor == "" && sf.asn == "" && sf.sourceTag == "" { return nil } f := &pb.Filter{} @@ -118,6 +120,9 @@ func buildFilter(sf *sharedFlags) *pb.Filter { f.AsnNumber = &n f.AsnOp = op } + if sf.sourceTag != "" { + f.IpngSourceTag = &sf.sourceTag + } return f } @@ -152,8 +157,12 @@ func parseGroupBy(s string) pb.GroupBy { return pb.GroupBy_REQUEST_URI case "status": return pb.GroupBy_HTTP_RESPONSE + case "asn": + return pb.GroupBy_ASN_NUMBER + case "source_tag", "source-tag": + return pb.GroupBy_SOURCE_TAG default: - fmt.Fprintf(os.Stderr, "--group-by: unknown value %q; valid: website prefix uri status\n", s) + fmt.Fprintf(os.Stderr, "--group-by: unknown value %q; valid: website prefix uri status asn source_tag\n", s) os.Exit(1) panic("unreachable") } diff --git a/cmd/cli/main.go b/cmd/cli/main.go index 3e08121..e123d01 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -22,11 +22,14 @@ Subcommand flags (all subcommands): --status EXPR filter: HTTP status expression (200, !=200, >=400, <500, …) --website-re REGEX filter: RE2 regex against website --uri-re REGEX filter: RE2 regex against request URI + --is-tor EXPR filter: TOR (1/!=0 = only, 0/!=1 = none) + --asn EXPR filter: ASN expression (12345, !=65000, …) + --source-tag STRING filter: exact ipng_source_tag match topn flags: --n INT number of entries (default 10) --window STR 1m 5m 15m 60m 6h 24h (default 5m) - --group-by STR website prefix uri status (default website) + --group-by STR website prefix uri status asn source_tag (default website) trend flags: --window STR 1m 5m 15m 60m 6h 24h (default 5m) diff --git a/cmd/collector/main.go b/cmd/collector/main.go index cc1dd35..a84f3cf 100644 --- a/cmd/collector/main.go +++ b/cmd/collector/main.go @@ -28,13 +28,17 @@ func main() { v4prefix := flag.Int("v4prefix", envOrInt("COLLECTOR_V4PREFIX", 24), "IPv4 prefix length for client bucketing (env: COLLECTOR_V4PREFIX)") v6prefix := flag.Int("v6prefix", envOrInt("COLLECTOR_V6PREFIX", 48), "IPv6 prefix length for client bucketing (env: COLLECTOR_V6PREFIX)") scanInterval := flag.Duration("scan-interval", envOrDuration("COLLECTOR_SCAN_INTERVAL", 10*time.Second), "how often to rescan glob patterns for new/removed files (env: COLLECTOR_SCAN_INTERVAL)") + logtailPort := flag.Int("logtail-port", envOrInt("COLLECTOR_LOGTAIL_PORT", 0), "UDP port to receive nginx ipng_stats_logtail packets, 0 to disable (env: COLLECTOR_LOGTAIL_PORT)") + logtailBind := flag.String("logtail-bind", envOr("COLLECTOR_LOGTAIL_BIND", "127.0.0.1"), "UDP bind address for the logtail listener (env: COLLECTOR_LOGTAIL_BIND)") flag.Parse() patterns := collectPatterns(*logPaths, *logsFile) - if len(patterns) == 0 { - log.Fatal("collector: no log paths specified; use --logs or --logs-file") + if len(patterns) == 0 && *logtailPort == 0 { + log.Fatal("collector: no inputs configured; use --logs, --logs-file, or --logtail-port") + } + if len(patterns) > 0 { + log.Printf("collector: watching %d pattern(s), rescan every %s", len(patterns), *scanInterval) } - log.Printf("collector: watching %d 
pattern(s), rescan every %s", len(patterns), *scanInterval) ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) defer stop() @@ -57,8 +61,16 @@ func main() { } go store.Run(ch) - tailer := NewMultiTailer(patterns, *scanInterval, *v4prefix, *v6prefix, ch) - go tailer.Run(ctx) + if len(patterns) > 0 { + tailer := NewMultiTailer(patterns, *scanInterval, *v4prefix, *v6prefix, ch) + go tailer.Run(ctx) + } + + if *logtailPort > 0 { + udp := NewUDPListener(net.JoinHostPort(*logtailBind, strconv.Itoa(*logtailPort)), *v4prefix, *v6prefix, ch) + udp.SetProm(store.prom) + go udp.Run(ctx) + } lis, err := net.Listen("tcp", *listen) if err != nil { diff --git a/cmd/collector/parser.go b/cmd/collector/parser.go index 8a48d18..3e90f37 100644 --- a/cmd/collector/parser.go +++ b/cmd/collector/parser.go @@ -18,65 +18,104 @@ type LogRecord struct { Method string BodyBytesSent int64 RequestTime float64 + SourceTag string } -// ParseLine parses a tab-separated logtail log line: +// fileSourceTag is the SourceTag assigned to records read from on-disk log +// files, which pre-date the tag concept. Mirrors nginx's fallback label. +const fileSourceTag = "direct" + +// ParseLine parses a tab-separated logtail log line from a file: // // $host \t $remote_addr \t $msec \t $request_method \t $request_uri \t $status \t $body_bytes_sent \t $request_time \t $is_tor \t $asn // // The is_tor (field 9) and asn (field 10) fields are optional for backward // compatibility with older log files that omit them; they default to false/0 -// when absent. +// when absent. SourceTag is always set to "direct" (file origin has no tag). // Returns false for lines with fewer than 8 fields. func ParseLine(line string, v4bits, v6bits int) (LogRecord, bool) { - // SplitN caps allocations; we need up to 10 fields. fields := strings.SplitN(line, "\t", 10) if len(fields) < 8 { return LogRecord{}, false } - - uri := fields[4] - if i := strings.IndexByte(uri, '?'); i >= 0 { - uri = uri[:i] - } - prefix, ok := truncateIP(fields[1], v4bits, v6bits) if !ok { return LogRecord{}, false } - isTor := len(fields) >= 9 && fields[8] == "1" - var asn int32 if len(fields) == 10 { if n, err := strconv.ParseInt(fields[9], 10, 32); err == nil { asn = int32(n) } } - - var bodyBytes int64 - if n, err := strconv.ParseInt(fields[6], 10, 64); err == nil { - bodyBytes = n - } - - var reqTime float64 - if f, err := strconv.ParseFloat(fields[7], 64); err == nil { - reqTime = f - } - return LogRecord{ Website: fields[0], ClientPrefix: prefix, - URI: uri, + URI: stripQuery(fields[4]), Status: fields[5], IsTor: isTor, ASN: asn, Method: fields[3], - BodyBytesSent: bodyBytes, - RequestTime: reqTime, + BodyBytesSent: parseInt(fields[6]), + RequestTime: parseFloat(fields[7]), + SourceTag: fileSourceTag, }, true } +// ParseUDPLine parses a tab-separated logtail log line from the UDP listener: +// +// $host \t $remote_addr \t $request_method \t $request_uri \t $status \t +// $body_bytes_sent \t $request_time \t $is_tor \t $asn \t +// $ipng_source_tag \t $server_addr \t $scheme +// +// All 12 fields are required. server_addr and scheme are consumed but not +// propagated. Returns false for any malformed packet (wrong field count, +// bad IP). 
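+// Example datagram payload (tab-separated on the wire; spaces shown here for
+// readability, matching the cases exercised in parser_test.go):
+//
+//	www.example.com  1.2.3.4  GET  /api/v1/search?q=foo  200  1452  0.043  0  12345  cdn  10.0.0.1  https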
+func ParseUDPLine(line string, v4bits, v6bits int) (LogRecord, bool) { + fields := strings.Split(line, "\t") + if len(fields) != 12 { + return LogRecord{}, false + } + prefix, ok := truncateIP(fields[1], v4bits, v6bits) + if !ok { + return LogRecord{}, false + } + var asn int32 + if n, err := strconv.ParseInt(fields[8], 10, 32); err == nil { + asn = int32(n) + } + return LogRecord{ + Website: fields[0], + ClientPrefix: prefix, + URI: stripQuery(fields[3]), + Status: fields[4], + IsTor: fields[7] == "1", + ASN: asn, + Method: fields[2], + BodyBytesSent: parseInt(fields[5]), + RequestTime: parseFloat(fields[6]), + SourceTag: fields[9], + }, true +} + +func stripQuery(uri string) string { + if i := strings.IndexByte(uri, '?'); i >= 0 { + return uri[:i] + } + return uri +} + +func parseInt(s string) int64 { + n, _ := strconv.ParseInt(s, 10, 64) + return n +} + +func parseFloat(s string) float64 { + f, _ := strconv.ParseFloat(s, 64) + return f +} + // truncateIP masks addr to the given prefix length depending on IP version. // Returns the CIDR string (e.g. "1.2.3.0/24") and true on success. func truncateIP(addr string, v4bits, v6bits int) (string, bool) { diff --git a/cmd/collector/parser_test.go b/cmd/collector/parser_test.go index eb7c29b..c7e755e 100644 --- a/cmd/collector/parser_test.go +++ b/cmd/collector/parser_test.go @@ -25,6 +25,7 @@ func TestParseLine(t *testing.T) { Method: "GET", BodyBytesSent: 1452, RequestTime: 0.043, + SourceTag: "direct", }, }, { @@ -38,6 +39,7 @@ func TestParseLine(t *testing.T) { Status: "201", Method: "POST", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -46,11 +48,12 @@ func TestParseLine(t *testing.T) { wantOK: true, want: LogRecord{ Website: "host", - ClientPrefix: "2001:db8:cafe::/48", // /48 = 3 full 16-bit groups intact + ClientPrefix: "2001:db8:cafe::/48", URI: "/", Status: "200", Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -79,6 +82,7 @@ func TestParseLine(t *testing.T) { Status: "429", Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -93,6 +97,7 @@ func TestParseLine(t *testing.T) { IsTor: true, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -107,6 +112,7 @@ func TestParseLine(t *testing.T) { IsTor: false, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -121,6 +127,7 @@ func TestParseLine(t *testing.T) { IsTor: false, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -136,6 +143,7 @@ func TestParseLine(t *testing.T) { ASN: 12345, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -151,6 +159,7 @@ func TestParseLine(t *testing.T) { ASN: 65535, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -166,6 +175,7 @@ func TestParseLine(t *testing.T) { ASN: 0, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, { @@ -181,6 +191,7 @@ func TestParseLine(t *testing.T) { ASN: 0, Method: "GET", RequestTime: 0.001, + SourceTag: "direct", }, }, } @@ -201,6 +212,84 @@ func TestParseLine(t *testing.T) { } } +func TestParseUDPLine(t *testing.T) { + // host \t remote_addr \t method \t uri \t status \t body_bytes \t req_time \t + // is_tor \t asn \t source_tag \t server_addr \t scheme + good := "www.example.com\t1.2.3.4\tGET\t/api/v1/search?q=foo\t200\t1452\t0.043\t0\t12345\tcdn\t10.0.0.1\thttps" + + tests := []struct { + name string + line string + wantOK bool + want LogRecord + }{ + { + name: "all 12 fields parsed, query stripped, extras dropped", + line: good, + wantOK: true, + want: LogRecord{ + Website: 
"www.example.com", + ClientPrefix: "1.2.3.0/24", + URI: "/api/v1/search", + Status: "200", + IsTor: false, + ASN: 12345, + Method: "GET", + BodyBytesSent: 1452, + RequestTime: 0.043, + SourceTag: "cdn", + }, + }, + { + name: "is_tor=1, tag direct, IPv6", + line: "h\t2001:db8::1\tGET\t/\t200\t0\t0\t1\t65535\tdirect\t::1\thttp", + wantOK: true, + want: LogRecord{ + Website: "h", + ClientPrefix: "2001:db8::/48", + URI: "/", + Status: "200", + IsTor: true, + ASN: 65535, + Method: "GET", + BodyBytesSent: 0, + RequestTime: 0, + SourceTag: "direct", + }, + }, + { + name: "11 fields rejected", + line: "h\t1.2.3.4\tGET\t/\t200\t0\t0\t0\t0\ttag\t10.0.0.1", + wantOK: false, + }, + { + name: "13 fields rejected", + line: good + "\textra", + wantOK: false, + }, + { + name: "bad IP rejected", + line: "h\tnope\tGET\t/\t200\t0\t0\t0\t0\ttag\t10.0.0.1\thttp", + wantOK: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + got, ok := ParseUDPLine(tc.line, 24, 48) + if ok != tc.wantOK { + t.Fatalf("ParseUDPLine ok=%v, want %v; got=%+v", ok, tc.wantOK, got) + } + if !tc.wantOK { + return + } + if got != tc.want { + t.Errorf("got %+v, want %+v", got, tc.want) + } + }) + } +} + func TestTruncateIP(t *testing.T) { tests := []struct { addr string @@ -208,8 +297,8 @@ func TestTruncateIP(t *testing.T) { }{ {"1.2.3.4", "1.2.3.0/24"}, {"192.168.100.200", "192.168.100.0/24"}, - {"2001:db8:cafe:babe::1", "2001:db8:cafe::/48"}, // /48 = 3 full groups intact - {"::1", "::/48"}, // loopback — first 48 bits are all zero + {"2001:db8:cafe:babe::1", "2001:db8:cafe::/48"}, + {"::1", "::/48"}, } for _, tc := range tests { diff --git a/cmd/collector/prom.go b/cmd/collector/prom.go index b487740..916100c 100644 --- a/cmd/collector/prom.go +++ b/cmd/collector/prom.go @@ -19,7 +19,7 @@ const promNumTimeBounds = 11 var promTimeBounds = [promNumTimeBounds]float64{.005, .01, .025, .05, .1, .25, .5, 1, 2.5, 5, 10} -const promCounterCap = 100_000 // safety cap on {host,method,status} counter entries +const promCounterCap = 250_000 // safety cap on {host,method,status} counter entries // promCounterKey is the label set for per-request counters. type promCounterKey struct { @@ -49,14 +49,26 @@ type PromStore struct { counters map[promCounterKey]int64 body map[string]*promBodyEntry // keyed by host reqTime map[string]*promTimeEntry // keyed by host + + // per-source_tag rollups (parallel series, not a cross-product with host) + sourceCounters map[string]int64 // keyed by source_tag + sourceBody map[string]*promBodyEntry // keyed by source_tag + + // UDP ingest counters — protected by their own atomic-friendly lock. + udpMu sync.Mutex + udpPacketsReceived int64 // datagrams read off the socket + udpLoglinesSuccess int64 // successfully parsed + udpLoglinesConsumed int64 // successfully forwarded to the store channel } // NewPromStore returns an empty PromStore ready for use. 
func NewPromStore() *PromStore { return &PromStore{ - counters: make(map[promCounterKey]int64, 1024), - body: make(map[string]*promBodyEntry, 64), - reqTime: make(map[string]*promTimeEntry, 64), + counters: make(map[promCounterKey]int64, 1024), + body: make(map[string]*promBodyEntry, 64), + reqTime: make(map[string]*promTimeEntry, 64), + sourceCounters: make(map[string]int64, 32), + sourceBody: make(map[string]*promBodyEntry, 32), } } @@ -74,18 +86,7 @@ func (p *PromStore) Ingest(r LogRecord) { } // --- body_bytes_sent histogram (keyed by host only) --- - be, ok := p.body[r.Website] - if !ok { - be = &promBodyEntry{} - p.body[r.Website] = be - } - for i, bound := range promBodyBounds { - if r.BodyBytesSent <= bound { - be.buckets[i]++ - } - } - be.buckets[promNumBodyBounds]++ // +Inf - be.sum += r.BodyBytesSent + observeBody(p.body, r.Website, r.BodyBytesSent) // --- request_time histogram (keyed by host only) --- te, ok := p.reqTime[r.Website] @@ -101,9 +102,34 @@ func (p *PromStore) Ingest(r LogRecord) { te.buckets[promNumTimeBounds]++ // +Inf te.sum += r.RequestTime + // --- per-source_tag rollups --- + p.sourceCounters[r.SourceTag]++ + observeBody(p.sourceBody, r.SourceTag, r.BodyBytesSent) + p.mu.Unlock() } +// IncUDPPacket, IncUDPSuccess, and IncUDPConsumed bump their respective +// UDP ingest counters. They are called from the UDP listener goroutine. +func (p *PromStore) IncUDPPacket() { p.udpMu.Lock(); p.udpPacketsReceived++; p.udpMu.Unlock() } +func (p *PromStore) IncUDPSuccess() { p.udpMu.Lock(); p.udpLoglinesSuccess++; p.udpMu.Unlock() } +func (p *PromStore) IncUDPConsumed() { p.udpMu.Lock(); p.udpLoglinesConsumed++; p.udpMu.Unlock() } + +func observeBody(m map[string]*promBodyEntry, key string, bytes int64) { + e, ok := m[key] + if !ok { + e = &promBodyEntry{} + m[key] = e + } + for i, bound := range promBodyBounds { + if bytes <= bound { + e.buckets[i]++ + } + } + e.buckets[promNumBodyBounds]++ // +Inf + e.sum += bytes +} + // ServeHTTP renders all metrics in the Prometheus text exposition format (0.0.4). func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) { // Snapshot everything under the lock, then render without holding it. @@ -119,8 +145,8 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) { } type bodySnap struct { - host string - e promBodyEntry + label string + e promBodyEntry } bodySnaps := make([]bodySnap, 0, len(p.body)) for h, e := range p.body { @@ -136,8 +162,27 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) { timeSnaps = append(timeSnaps, timeSnap{h, *e}) } + type sourceCounterSnap struct { + tag string + v int64 + } + sourceCounters := make([]sourceCounterSnap, 0, len(p.sourceCounters)) + for t, v := range p.sourceCounters { + sourceCounters = append(sourceCounters, sourceCounterSnap{t, v}) + } + sourceBodySnaps := make([]bodySnap, 0, len(p.sourceBody)) + for t, e := range p.sourceBody { + sourceBodySnaps = append(sourceBodySnaps, bodySnap{t, *e}) + } + p.mu.Unlock() + p.udpMu.Lock() + udpPackets := p.udpPacketsReceived + udpSuccess := p.udpLoglinesSuccess + udpConsumed := p.udpLoglinesConsumed + p.udpMu.Unlock() + // Sort for stable, human-readable output. 
sort.Slice(counters, func(i, j int) bool { a, b := counters[i].k, counters[j].k @@ -149,8 +194,10 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) { } return a.Status < b.Status }) - sort.Slice(bodySnaps, func(i, j int) bool { return bodySnaps[i].host < bodySnaps[j].host }) + sort.Slice(bodySnaps, func(i, j int) bool { return bodySnaps[i].label < bodySnaps[j].label }) sort.Slice(timeSnaps, func(i, j int) bool { return timeSnaps[i].host < timeSnaps[j].host }) + sort.Slice(sourceCounters, func(i, j int) bool { return sourceCounters[i].tag < sourceCounters[j].tag }) + sort.Slice(sourceBodySnaps, func(i, j int) bool { return sourceBodySnaps[i].label < sourceBodySnaps[j].label }) w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") bw := bufio.NewWriterSize(w, 256*1024) @@ -167,16 +214,7 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) { fmt.Fprintln(bw, "# HELP nginx_http_response_body_bytes HTTP response body size distribution in bytes.") fmt.Fprintln(bw, "# TYPE nginx_http_response_body_bytes histogram") for _, s := range bodySnaps { - for i, bound := range promBodyBounds { - fmt.Fprintf(bw, "nginx_http_response_body_bytes_bucket{host=%q,le=%q} %d\n", - s.host, fmt.Sprintf("%d", bound), s.e.buckets[i]) - } - fmt.Fprintf(bw, "nginx_http_response_body_bytes_bucket{host=%q,le=\"+Inf\"} %d\n", - s.host, s.e.buckets[promNumBodyBounds]) - fmt.Fprintf(bw, "nginx_http_response_body_bytes_count{host=%q} %d\n", - s.host, s.e.buckets[promNumBodyBounds]) - fmt.Fprintf(bw, "nginx_http_response_body_bytes_sum{host=%q} %d\n", - s.host, s.e.sum) + writeBodyHistogram(bw, "nginx_http_response_body_bytes", "host", s.label, s.e) } // nginx_http_request_duration_seconds (histogram, labeled by host) @@ -195,9 +233,48 @@ func (p *PromStore) ServeHTTP(w http.ResponseWriter, _ *http.Request) { s.host, s.e.sum) } + // nginx_http_requests_by_source_total (counter, labeled by source_tag) + fmt.Fprintln(bw, "# HELP nginx_http_requests_by_source_total HTTP requests rolled up by nginx source tag.") + fmt.Fprintln(bw, "# TYPE nginx_http_requests_by_source_total counter") + for _, c := range sourceCounters { + fmt.Fprintf(bw, "nginx_http_requests_by_source_total{source_tag=%q} %d\n", c.tag, c.v) + } + + // nginx_http_response_body_bytes_by_source (histogram, labeled by source_tag) + fmt.Fprintln(bw, "# HELP nginx_http_response_body_bytes_by_source HTTP response body size distribution by nginx source tag.") + fmt.Fprintln(bw, "# TYPE nginx_http_response_body_bytes_by_source histogram") + for _, s := range sourceBodySnaps { + writeBodyHistogram(bw, "nginx_http_response_body_bytes_by_source", "source_tag", s.label, s.e) + } + + // UDP ingest counters — lets operators distinguish parse failures + // (received - success) from channel-full drops (success - consumed). 
+ fmt.Fprintln(bw, "# HELP logtail_udp_packets_received_total Datagrams read from the UDP socket.") + fmt.Fprintln(bw, "# TYPE logtail_udp_packets_received_total counter") + fmt.Fprintf(bw, "logtail_udp_packets_received_total %d\n", udpPackets) + fmt.Fprintln(bw, "# HELP logtail_udp_loglines_success_total UDP loglines that parsed successfully.") + fmt.Fprintln(bw, "# TYPE logtail_udp_loglines_success_total counter") + fmt.Fprintf(bw, "logtail_udp_loglines_success_total %d\n", udpSuccess) + fmt.Fprintln(bw, "# HELP logtail_udp_loglines_consumed_total UDP loglines forwarded to the store (not dropped).") + fmt.Fprintln(bw, "# TYPE logtail_udp_loglines_consumed_total counter") + fmt.Fprintf(bw, "logtail_udp_loglines_consumed_total %d\n", udpConsumed) + bw.Flush() } +func writeBodyHistogram(bw *bufio.Writer, metric, labelName, labelValue string, e promBodyEntry) { + for i, bound := range promBodyBounds { + fmt.Fprintf(bw, "%s_bucket{%s=%q,le=%q} %d\n", + metric, labelName, labelValue, fmt.Sprintf("%d", bound), e.buckets[i]) + } + fmt.Fprintf(bw, "%s_bucket{%s=%q,le=\"+Inf\"} %d\n", + metric, labelName, labelValue, e.buckets[promNumBodyBounds]) + fmt.Fprintf(bw, "%s_count{%s=%q} %d\n", + metric, labelName, labelValue, e.buckets[promNumBodyBounds]) + fmt.Fprintf(bw, "%s_sum{%s=%q} %d\n", + metric, labelName, labelValue, e.sum) +} + // formatFloat renders a float64 bucket bound without trailing zeros but always // with at least one decimal place, matching Prometheus convention (e.g. "0.5", "10"). func formatFloat(f float64) string { diff --git a/cmd/collector/prom_test.go b/cmd/collector/prom_test.go index 39f9b26..d005544 100644 --- a/cmd/collector/prom_test.go +++ b/cmd/collector/prom_test.go @@ -110,6 +110,61 @@ func TestPromStoreServeHTTP(t *testing.T) { } } +func TestPromStoreSourceTagRollup(t *testing.T) { + ps := NewPromStore() + // same host, two tags; each tag should appear with its own series. + ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BodyBytesSent: 100, SourceTag: "direct"}) + ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BodyBytesSent: 300, SourceTag: "cdn"}) + ps.Ingest(LogRecord{Website: "h", Method: "GET", Status: "200", BodyBytesSent: 100, SourceTag: "cdn"}) + + req := httptest.NewRequest("GET", "/metrics", nil) + rec := httptest.NewRecorder() + ps.ServeHTTP(rec, req) + body := rec.Body.String() + + checks := []string{ + "# TYPE nginx_http_requests_by_source_total counter", + `nginx_http_requests_by_source_total{source_tag="direct"} 1`, + `nginx_http_requests_by_source_total{source_tag="cdn"} 2`, + "# TYPE nginx_http_response_body_bytes_by_source histogram", + `nginx_http_response_body_bytes_by_source_sum{source_tag="direct"} 100`, + `nginx_http_response_body_bytes_by_source_sum{source_tag="cdn"} 400`, + // host-series totals are unchanged (one row, counting 3 requests). 
+ `nginx_http_requests_total{host="h",method="GET",status="200"} 3`, + } + for _, want := range checks { + if !strings.Contains(body, want) { + t.Errorf("missing %q in output:\n%s", want, body) + } + } +} + +func TestPromStoreUDPCounters(t *testing.T) { + ps := NewPromStore() + ps.IncUDPPacket() + ps.IncUDPPacket() + ps.IncUDPPacket() + ps.IncUDPSuccess() + ps.IncUDPSuccess() + ps.IncUDPConsumed() + + req := httptest.NewRequest("GET", "/metrics", nil) + rec := httptest.NewRecorder() + ps.ServeHTTP(rec, req) + body := rec.Body.String() + + checks := []string{ + "logtail_udp_packets_received_total 3", + "logtail_udp_loglines_success_total 2", + "logtail_udp_loglines_consumed_total 1", + } + for _, want := range checks { + if !strings.Contains(body, want) { + t.Errorf("missing %q in output:\n%s", want, body) + } + } +} + func TestPromStoreCounterCap(t *testing.T) { ps := NewPromStore() // Fill to cap with distinct {host,method,status} combos diff --git a/cmd/collector/store.go b/cmd/collector/store.go index e17e26d..0dbd1ea 100644 --- a/cmd/collector/store.go +++ b/cmd/collector/store.go @@ -48,7 +48,7 @@ func (s *Store) ingest(r LogRecord) { if s.prom != nil { s.prom.Ingest(r) } - key := st.Tuple6{Website: r.Website, Prefix: r.ClientPrefix, URI: r.URI, Status: r.Status, IsTor: r.IsTor, ASN: r.ASN} + key := st.Tuple6{Website: r.Website, Prefix: r.ClientPrefix, URI: r.URI, Status: r.Status, IsTor: r.IsTor, ASN: r.ASN, SourceTag: r.SourceTag} if _, exists := s.live[key]; !exists { if s.liveLen >= liveMapCap { return diff --git a/cmd/collector/udp.go b/cmd/collector/udp.go new file mode 100644 index 0000000..9ad6404 --- /dev/null +++ b/cmd/collector/udp.go @@ -0,0 +1,86 @@ +package main + +import ( + "context" + "log" + "net" + "strings" +) + +// udpReadBufBytes is the SO_RCVBUF size requested. Bursts of ~10K lines/sec at +// ~200B each comfortably fit; the kernel may cap below this. +const udpReadBufBytes = 4 << 20 + +// udpPacketBuf is the per-read buffer. A single nginx log line easily fits in +// a few KB; 64K is the practical UDP datagram ceiling. +const udpPacketBuf = 64 << 10 + +// UDPListener receives nginx_ipng_stats_logtail datagrams on a local socket, +// parses each packet as one log line, and forwards LogRecords to ch. +type UDPListener struct { + addr string + v4bits int + v6bits int + ch chan<- LogRecord + prom *PromStore // optional; bumps UDP ingest counters +} + +func NewUDPListener(addr string, v4bits, v6bits int, ch chan<- LogRecord) *UDPListener { + return &UDPListener{addr: addr, v4bits: v4bits, v6bits: v6bits, ch: ch} +} + +// SetProm wires a PromStore so the listener can report received/success/consumed counts. +func (u *UDPListener) SetProm(p *PromStore) { u.prom = p } + +// Run listens until ctx is cancelled. 
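+// Malformed datagrams are counted as received but not as successes; when the
+// store channel is full the parsed record is dropped rather than blocking
+// the socket read loop.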
+func (u *UDPListener) Run(ctx context.Context) { + laddr, err := net.ResolveUDPAddr("udp", u.addr) + if err != nil { + log.Fatalf("udp: resolve %s: %v", u.addr, err) + } + conn, err := net.ListenUDP("udp", laddr) + if err != nil { + log.Fatalf("udp: listen %s: %v", u.addr, err) + } + defer conn.Close() + if err := conn.SetReadBuffer(udpReadBufBytes); err != nil { + log.Printf("udp: SetReadBuffer(%d): %v", udpReadBufBytes, err) + } + log.Printf("udp: listening on %s", conn.LocalAddr()) + + go func() { + <-ctx.Done() + conn.Close() + }() + + buf := make([]byte, udpPacketBuf) + for { + n, _, err := conn.ReadFromUDP(buf) + if err != nil { + if ctx.Err() != nil { + return + } + log.Printf("udp: read: %v", err) + continue + } + if u.prom != nil { + u.prom.IncUDPPacket() + } + line := strings.TrimRight(string(buf[:n]), "\r\n") + rec, ok := ParseUDPLine(line, u.v4bits, u.v6bits) + if !ok { + continue + } + if u.prom != nil { + u.prom.IncUDPSuccess() + } + select { + case u.ch <- rec: + if u.prom != nil { + u.prom.IncUDPConsumed() + } + default: + // Channel full — drop rather than block the read loop. + } + } +} diff --git a/cmd/collector/udp_test.go b/cmd/collector/udp_test.go new file mode 100644 index 0000000..6105ad9 --- /dev/null +++ b/cmd/collector/udp_test.go @@ -0,0 +1,67 @@ +package main + +import ( + "context" + "net" + "testing" + "time" +) + +func TestUDPListenerRoundTrip(t *testing.T) { + ch := make(chan LogRecord, 4) + ps := NewPromStore() + + // Bind to an ephemeral port on loopback. + pc, err := net.ListenPacket("udp", "127.0.0.1:0") + if err != nil { + t.Fatalf("listen probe: %v", err) + } + addr := pc.LocalAddr().String() + pc.Close() // release; listener will re-bind + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + u := NewUDPListener(addr, 24, 48, ch) + u.SetProm(ps) + go u.Run(ctx) + + // Dial the listener and send one valid and one malformed packet. + conn, err := net.Dial("udp", addr) + if err != nil { + t.Fatalf("dial: %v", err) + } + defer conn.Close() + + // The listener is started asynchronously; retry for up to 1s. + good := "www.example.com\t1.2.3.4\tGET\t/\t200\t42\t0.010\t0\t12345\tdirect\t10.0.0.1\thttps" + bad := "not enough\tfields" + deadline := time.Now().Add(time.Second) + for time.Now().Before(deadline) { + conn.Write([]byte(good)) + conn.Write([]byte(bad)) + select { + case rec := <-ch: + if rec.Website != "www.example.com" || rec.SourceTag != "direct" { + t.Fatalf("bad record: %+v", rec) + } + // Give the listener a moment to process the malformed packet too. 
+ time.Sleep(50 * time.Millisecond) + ps.udpMu.Lock() + pkt, suc, con := ps.udpPacketsReceived, ps.udpLoglinesSuccess, ps.udpLoglinesConsumed + ps.udpMu.Unlock() + if pkt < 2 { + t.Errorf("udpPacketsReceived=%d, want >=2", pkt) + } + if suc < 1 { + t.Errorf("udpLoglinesSuccess=%d, want >=1", suc) + } + if con < 1 { + t.Errorf("udpLoglinesConsumed=%d, want >=1", con) + } + return + case <-time.After(50 * time.Millisecond): + } + } + t.Fatal("no record received within 1s") +} diff --git a/cmd/frontend/filter.go b/cmd/frontend/filter.go index ff797fe..f148c8e 100644 --- a/cmd/frontend/filter.go +++ b/cmd/frontend/filter.go @@ -146,8 +146,13 @@ func applyTerm(term string, fs *filterState) error { return fmt.Errorf("invalid asn expression %q", expr) } fs.ASN = expr + case "source_tag": + if op != "=" { + return fmt.Errorf("source_tag only supports =, not %q", op) + } + fs.SourceTag = value default: - return fmt.Errorf("unknown field %q; valid: status, website, uri, prefix, is_tor, asn", field) + return fmt.Errorf("unknown field %q; valid: status, website, uri, prefix, is_tor, asn, source_tag", field) } return nil } @@ -196,6 +201,9 @@ func FilterExprString(f filterState) string { if f.ASN != "" { parts = append(parts, asnTermStr(f.ASN)) } + if f.SourceTag != "" { + parts = append(parts, "source_tag="+quoteMaybe(f.SourceTag)) + } return strings.Join(parts, " AND ") } diff --git a/cmd/frontend/frontend_test.go b/cmd/frontend/frontend_test.go index 6797dbc..f76e2aa 100644 --- a/cmd/frontend/frontend_test.go +++ b/cmd/frontend/frontend_test.go @@ -229,8 +229,17 @@ func TestDrillURL(t *testing.T) { if !strings.Contains(u, "f_asn=12345") { t.Errorf("drill from asn: missing f_asn in %q", u) } + if !strings.Contains(u, "by=source_tag") { + t.Errorf("drill from asn: expected next by=source_tag in %q", u) + } + + p.GroupByS = "source_tag" + u = p.drillURL("direct") + if !strings.Contains(u, "f_source_tag=direct") { + t.Errorf("drill from source_tag: missing f_source_tag in %q", u) + } if !strings.Contains(u, "by=website") { - t.Errorf("drill from asn: expected cycle back to by=website in %q", u) + t.Errorf("drill from source_tag: expected cycle back to by=website in %q", u) } } diff --git a/cmd/frontend/handler.go b/cmd/frontend/handler.go index 79d62d3..9ba6b0b 100644 --- a/cmd/frontend/handler.go +++ b/cmd/frontend/handler.go @@ -58,6 +58,7 @@ type filterState struct { URIReNeg string // RE2 regex exclusion against request URI IsTor string // "", "1" (TOR only), "0" (non-TOR only) ASN string // expression: "12345", "!=65000", ">=1000", etc. + SourceTag string // exact ipng_source_tag match } // QueryParams holds all parsed URL parameters for one page request. 
@@ -95,7 +96,7 @@ var windowSpecs = []struct{ s, label string }{ } var groupBySpecs = []struct{ s, label string }{ - {"website", "website"}, {"asn", "asn"}, {"prefix", "prefix"}, {"status", "status"}, {"uri", "uri"}, + {"website", "website"}, {"asn", "asn"}, {"prefix", "prefix"}, {"status", "status"}, {"uri", "uri"}, {"source_tag", "source"}, } func parseWindowString(s string) (pb.Window, string) { @@ -127,6 +128,8 @@ func parseGroupByString(s string) (pb.GroupBy, string) { return pb.GroupBy_HTTP_RESPONSE, "status" case "asn": return pb.GroupBy_ASN_NUMBER, "asn" + case "source_tag": + return pb.GroupBy_SOURCE_TAG, "source_tag" default: return pb.GroupBy_WEBSITE, "website" } @@ -168,12 +171,13 @@ func (h *Handler) parseParams(r *http.Request) QueryParams { URIReNeg: q.Get("f_uri_re_neg"), IsTor: q.Get("f_is_tor"), ASN: q.Get("f_asn"), + SourceTag: q.Get("f_source_tag"), }, } } func buildFilter(f filterState) *pb.Filter { - if f.Website == "" && f.Prefix == "" && f.URI == "" && f.Status == "" && f.WebsiteRe == "" && f.URIRe == "" && f.WebsiteReNeg == "" && f.URIReNeg == "" && f.IsTor == "" && f.ASN == "" { + if f.Website == "" && f.Prefix == "" && f.URI == "" && f.Status == "" && f.WebsiteRe == "" && f.URIRe == "" && f.WebsiteReNeg == "" && f.URIReNeg == "" && f.IsTor == "" && f.ASN == "" && f.SourceTag == "" { return nil } out := &pb.Filter{} @@ -216,6 +220,9 @@ func buildFilter(f filterState) *pb.Filter { out.AsnOp = op } } + if f.SourceTag != "" { + out.IpngSourceTag = &f.SourceTag + } return out } @@ -256,6 +263,9 @@ func (p QueryParams) toValues() url.Values { if p.Filter.ASN != "" { v.Set("f_asn", p.Filter.ASN) } + if p.Filter.SourceTag != "" { + v.Set("f_source_tag", p.Filter.SourceTag) + } return v } @@ -278,7 +288,7 @@ func (p QueryParams) clearFilterURL() string { return p.buildURL(map[string]string{ "f_website": "", "f_prefix": "", "f_uri": "", "f_status": "", "f_website_re": "", "f_uri_re": "", "f_website_re_neg": "", "f_uri_re_neg": "", - "f_is_tor": "", "f_asn": "", + "f_is_tor": "", "f_asn": "", "f_source_tag": "", }) } @@ -293,7 +303,9 @@ func nextGroupBy(s string) string { return "status" case "status": return "asn" - default: // asn → back to website + case "asn": + return "source_tag" + default: // source_tag → back to website return "website" } } @@ -311,6 +323,8 @@ func groupByFilterKey(s string) string { return "f_status" case "asn": return "f_asn" + case "source_tag": + return "f_source_tag" default: return "f_website" } @@ -391,6 +405,12 @@ func buildCrumbs(p QueryParams) []Crumb { RemoveURL: p.buildURL(map[string]string{"f_asn": ""}), }) } + if p.Filter.SourceTag != "" { + crumbs = append(crumbs, Crumb{ + Text: "source_tag=" + p.Filter.SourceTag, + RemoveURL: p.buildURL(map[string]string{"f_source_tag": ""}), + }) + } return crumbs } diff --git a/docs/DETAILS.md b/docs/DETAILS.md deleted file mode 100644 index 11ff58e..0000000 --- a/docs/DETAILS.md +++ /dev/null @@ -1,528 +0,0 @@ -PREAMBLE - -Although this computer program has a permissive license (AP2.0), if you came here looking to ask -questions, you're better off just moving on :) This program is shared AS-IS and really without any -intent for anybody but IPng Networks to use it. Also, in case the structure of the repo and the -style of this README wasn't already clear, this program is 100% written and maintained by Claude -Code. 
- -You have been warned :) - -SPECIFICATION - -This project contains four programs: - -1) A **collector** that tails any number of nginx log files and maintains an in-memory structure of -`{website, client_prefix, http_request_uri, http_response, is_tor, asn}` counts across all files. -It answers TopN and Trend queries via gRPC and pushes minute snapshots to the aggregator via -server-streaming. It also exposes a Prometheus `/metrics` endpoint (default `:9100`) with per-host -request counters and response-body/request-time histograms. -Runs on each nginx machine in the cluster. No UI — gRPC and HTTP interfaces only. - -2) An **aggregator** that subscribes to the snapshot stream from all collectors, merges their data -into a unified in-memory cache, and exposes the same gRPC interface. Answers questions like "what -is the busiest website globally", "which client prefix is causing the most HTTP 503s", and shows -trending information useful for DDoS detection. Runs on a central machine. - -3) An **HTTP frontend** companion to the aggregator that renders a drilldown dashboard. Operators -can restrict by `http_response=429`, then by `website=www.example.com`, and so on. Works with -either a collector or aggregator as its backend. Zero JavaScript — server-rendered HTML with inline -SVG sparklines and meta-refresh. - -4) A **CLI** for shell-based debugging. Sends `topn`, `trend`, and `stream` queries to any -collector or aggregator, fans out to multiple targets in parallel, and outputs human-readable -tables or newline-delimited JSON. - -Programs are written in Go. No CGO, no external runtime dependencies. - ---- - -![nginx-logtail frontend](docs/frontend.png) - ---- - -DEPLOYMENT - -## Docker - -All four binaries are published in a single image: `git.ipng.ch/ipng/nginx-logtail`. - -The image is built with a two-stage Dockerfile: a `golang:1.24-alpine` builder produces -statically-linked, stripped binaries (`CGO_ENABLED=0`, `-trimpath -ldflags="-s -w"`); the final -stage is `scratch` — no OS, no shell, no runtime dependencies. Each binary is invoked explicitly -via the container `command`. - -### Build and push - -``` -docker compose build --push -``` - -### Running aggregator + frontend - -The `docker-compose.yml` in the repo root runs the aggregator and frontend together. At minimum, -set `AGGREGATOR_COLLECTORS` to the comma-separated `host:port` list of your collector(s): - -```sh -AGGREGATOR_COLLECTORS=nginx1:9090,nginx2:9090 docker compose up -d -``` - -The frontend reaches the aggregator at `aggregator:9091` via Docker's internal DNS. The frontend -UI is available on port `8080`. - -### Environment variables - -All flags have environment variable equivalents. CLI flags take precedence over env vars. 
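That precedence falls out of how the binaries wire flags and environment variables together: the env var only supplies the flag's default value, so anything given on the command line wins. Below is a minimal sketch of the pattern visible in `cmd/collector/main.go`; the body of the `envOrInt` helper is an assumption here and may differ from the repository's implementation.

```go
package main

import (
	"flag"
	"fmt"
	"os"
	"strconv"
)

// envOrInt returns the environment variable named key as an int, falling
// back to def when it is unset or not a valid integer. (Sketch only.)
func envOrInt(key string, def int) int {
	if v := os.Getenv(key); v != "" {
		if n, err := strconv.Atoi(v); err == nil {
			return n
		}
	}
	return def
}

func main() {
	// COLLECTOR_V4PREFIX merely changes the default; an explicit
	// -v4prefix on the command line still overrides it.
	v4prefix := flag.Int("v4prefix", envOrInt("COLLECTOR_V4PREFIX", 24),
		"IPv4 prefix length for client bucketing (env: COLLECTOR_V4PREFIX)")
	flag.Parse()
	fmt.Println("v4prefix =", *v4prefix)
}
```

So `COLLECTOR_V4PREFIX=20 ./collector -v4prefix 28` buckets IPv4 clients into /28s, not /20s.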
- -**collector** (runs on each nginx host, not in Docker): - -| Env var | Flag | Default | -|--------------------------|-------------------|-------------| -| `COLLECTOR_LISTEN` | `-listen` | `:9090` | -| `COLLECTOR_PROM_LISTEN` | `-prom-listen` | `:9100` | -| `COLLECTOR_LOGS` | `-logs` | — | -| `COLLECTOR_LOGS_FILE` | `-logs-file` | — | -| `COLLECTOR_SOURCE` | `-source` | hostname | -| `COLLECTOR_V4PREFIX` | `-v4prefix` | `24` | -| `COLLECTOR_V6PREFIX` | `-v6prefix` | `48` | -| `COLLECTOR_SCAN_INTERVAL`| `-scan-interval` | `10s` | - -**aggregator**: - -| Env var | Flag | Default | -|--------------------------|---------------|-------------| -| `AGGREGATOR_LISTEN` | `-listen` | `:9091` | -| `AGGREGATOR_COLLECTORS` | `-collectors` | — (required)| -| `AGGREGATOR_SOURCE` | `-source` | hostname | - -**frontend**: - -| Env var | Flag | Default | -|------------------|------------|-------------------| -| `FRONTEND_LISTEN`| `-listen` | `:8080` | -| `FRONTEND_TARGET`| `-target` | `localhost:9091` | -| `FRONTEND_N` | `-n` | `25` | -| `FRONTEND_REFRESH`| `-refresh`| `30` | - ---- - -DESIGN - -## Directory Layout - -``` -nginx-logtail/ -├── proto/ -│ ├── logtail.proto # shared protobuf definitions -│ └── logtailpb/ -│ ├── logtail.pb.go # generated: messages, enums -│ └── logtail_grpc.pb.go # generated: service stubs -├── internal/ -│ └── store/ -│ └── store.go # shared types: Tuple6, Entry, Snapshot, ring helpers -└── cmd/ - ├── collector/ - │ ├── main.go - │ ├── tailer.go # MultiTailer: tail N files via one shared fsnotify watcher - │ ├── parser.go # tab-separated logtail log_format parser (~50 ns/line) - │ ├── store.go # bounded top-K in-memory store + tiered ring buffers - │ └── server.go # gRPC server: TopN, Trend, StreamSnapshots - ├── aggregator/ - │ ├── main.go - │ ├── subscriber.go # one goroutine per collector; StreamSnapshots with backoff - │ ├── merger.go # delta-merge: O(snapshot_size) per update - │ ├── cache.go # tick-based ring buffer cache served to clients - │ ├── registry.go # TargetRegistry: addr→name map updated from snapshot sources - │ └── server.go # gRPC server (same surface as collector) - ├── frontend/ - │ ├── main.go - │ ├── handler.go # URL param parsing, concurrent TopN+Trend, template exec - │ ├── filter.go # ParseFilterExpr / FilterExprString mini filter language - │ ├── client.go # gRPC dial helper - │ ├── sparkline.go # TrendPoints → inline SVG polyline - │ ├── format.go # fmtCount (space thousands separator) - │ └── templates/ - │ ├── base.html # outer HTML shell, inline CSS, meta-refresh - │ └── index.html # window tabs, group-by tabs, breadcrumb, table, footer - └── cli/ - ├── main.go # subcommand dispatch and usage - ├── flags.go # shared flags, parseTargets, buildFilter, parseWindow - ├── client.go # gRPC dial helper - ├── format.go # printTable, fmtCount, fmtTime, targetHeader - ├── cmd_topn.go # topn: concurrent fan-out, table + JSON output - ├── cmd_trend.go # trend: concurrent fan-out, table + JSON output - ├── cmd_stream.go # stream: multiplexed streams, auto-reconnect - └── cmd_targets.go # targets: list collectors known to the endpoint -``` - -## Data Model - -The core unit is a **count keyed by six dimensions**: - -| Field | Description | Example | -|-------------------|------------------------------------------------------|-------------------| -| `website` | nginx `$host` | `www.example.com` | -| `client_prefix` | client IP truncated to /24 IPv4 or /48 IPv6 | `1.2.3.0/24` | -| `http_request_uri`| `$request_uri` path only — query string stripped | 
`/api/v1/search` | -| `http_response` | HTTP status code | `429` | -| `is_tor` | whether the client IP is a TOR exit node | `1` | -| `asn` | client AS number (MaxMind GeoIP2, 32-bit int) | `8298` | - -## Time Windows & Tiered Ring Buffers - -Two ring buffers at different resolutions cover all query windows up to 24 hours: - -| Tier | Bucket size | Buckets | Top-K/bucket | Covers | Roll-up trigger | -|--------|-------------|---------|--------------|--------|---------------------| -| Fine | 1 min | 60 | 50 000 | 1 h | every minute | -| Coarse | 5 min | 288 | 5 000 | 24 h | every 5 fine ticks | - -Supported query windows and which tier they read from: - -| Window | Tier | Buckets summed | -|--------|--------|----------------| -| 1 min | fine | last 1 | -| 5 min | fine | last 5 | -| 15 min | fine | last 15 | -| 60 min | fine | all 60 | -| 6 h | coarse | last 72 | -| 24 h | coarse | all 288 | - -Every minute: snapshot live map → top-50K → append to fine ring, reset live map. -Every 5 minutes: merge last 5 fine snapshots → top-5K → append to coarse ring. - -## Memory Budget (Collector, target ≤ 1 GB) - -Entry size: ~30 B website + ~15 B prefix + ~50 B URI + 3 B status + 1 B is_tor + 4 B asn + 8 B count + ~80 B Go map -overhead ≈ **~191 bytes per entry**. - -| Structure | Entries | Size | -|-------------------------|-------------|-------------| -| Live map (capped) | 100 000 | ~19 MB | -| Fine ring (60 × 1-min) | 60 × 50 000 | ~558 MB | -| Coarse ring (288 × 5-min)| 288 × 5 000| ~268 MB | -| **Total** | | **~845 MB** | - -The live map is **hard-capped at 100 K entries**. Once full, only updates to existing keys are -accepted; new keys are dropped until the next rotation resets the map. This keeps memory bounded -regardless of attack cardinality. - -## Future Work — ClickHouse Export (post-MVP) - -> **Do not implement until the end-to-end MVP is running.** - -The aggregator will optionally write 1-minute pre-aggregated rows to ClickHouse for 7d/30d -historical views. Schema sketch: - -```sql -CREATE TABLE logtail ( - ts DateTime, - website LowCardinality(String), - client_prefix String, - request_uri LowCardinality(String), - status UInt16, - count UInt64 -) ENGINE = SummingMergeTree(count) -PARTITION BY toYYYYMMDD(ts) -ORDER BY (ts, website, status, client_prefix, request_uri); -``` - -The frontend routes `window=7d|30d` queries to ClickHouse; all shorter windows continue to use -the in-memory cache. Kafka is not needed — the aggregator writes directly. This is purely additive -and does not change any existing interface. 
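Purely for illustration (this is post-MVP and deliberately not implemented), a once-per-minute flush from the aggregator could look roughly like the sketch below. It assumes some ClickHouse driver that registers itself with `database/sql`; the `Row` type, driver import, and placeholder syntax are assumptions, not code from this repository.

```go
package export

import (
	"context"
	"database/sql"
	"time"
	// Assumes any database/sql-compatible ClickHouse driver, e.g.:
	// _ "github.com/ClickHouse/clickhouse-go/v2"
)

// Row is one pre-aggregated 1-minute bucket entry (illustrative only).
type Row struct {
	TS           time.Time
	Website      string
	ClientPrefix string
	RequestURI   string
	Status       uint16
	Count        uint64
}

// exportMinute writes one minute of rows inside a single transaction so the
// driver can batch them; SummingMergeTree later collapses duplicate keys.
func exportMinute(ctx context.Context, db *sql.DB, rows []Row) error {
	tx, err := db.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	stmt, err := tx.PrepareContext(ctx,
		`INSERT INTO logtail (ts, website, client_prefix, request_uri, status, count)
		 VALUES (?, ?, ?, ?, ?, ?)`)
	if err != nil {
		tx.Rollback()
		return err
	}
	defer stmt.Close()
	for _, r := range rows {
		if _, err := stmt.ExecContext(ctx, r.TS, r.Website, r.ClientPrefix, r.RequestURI, r.Status, r.Count); err != nil {
			tx.Rollback()
			return err
		}
	}
	return tx.Commit()
}
```

Because the table is a `SummingMergeTree`, re-inserting the same minute after a retry inflates `count`, so a real exporter would need to keep flushes at-most-once per bucket.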
- -## Protobuf API (`proto/logtail.proto`) - -```protobuf -enum TorFilter { TOR_ANY = 0; TOR_YES = 1; TOR_NO = 2; } -enum StatusOp { EQ = 0; NE = 1; GT = 2; GE = 3; LT = 4; LE = 5; } - -message Filter { - optional string website = 1; - optional string client_prefix = 2; - optional string http_request_uri = 3; - optional int32 http_response = 4; - StatusOp status_op = 5; // comparison operator for http_response - optional string website_regex = 6; // RE2 regex against website - optional string uri_regex = 7; // RE2 regex against http_request_uri - TorFilter tor = 8; // TOR_ANY (default) / TOR_YES / TOR_NO - optional int32 asn_number = 9; // filter by client ASN - StatusOp asn_op = 10; // comparison operator for asn_number -} - -enum GroupBy { WEBSITE = 0; CLIENT_PREFIX = 1; REQUEST_URI = 2; HTTP_RESPONSE = 3; ASN_NUMBER = 4; } -enum Window { W1M = 0; W5M = 1; W15M = 2; W60M = 3; W6H = 4; W24H = 5; } - -message TopNRequest { Filter filter = 1; GroupBy group_by = 2; int32 n = 3; Window window = 4; } -message TopNEntry { string label = 1; int64 count = 2; } -message TopNResponse { repeated TopNEntry entries = 1; string source = 2; } - -// Trend: one total count per minute (or 5-min) bucket, for sparklines -message TrendRequest { Filter filter = 1; Window window = 4; } -message TrendPoint { int64 timestamp_unix = 1; int64 count = 2; } -message TrendResponse { repeated TrendPoint points = 1; string source = 2; } - -// Streaming: collector pushes a fine snapshot after every minute rotation -message SnapshotRequest {} -message Snapshot { - string source = 1; - int64 timestamp = 2; - repeated TopNEntry entries = 3; // full top-50K for this bucket - bool is_coarse = 4; // true for 5-min coarse buckets (DumpSnapshots only) -} - -// Target discovery: list the collectors behind the queried endpoint -message ListTargetsRequest {} -message TargetInfo { - string name = 1; // display name (--source value from the collector) - string addr = 2; // gRPC address; empty string means "this endpoint itself" -} -message ListTargetsResponse { repeated TargetInfo targets = 1; } - -// Backfill: dump full ring buffer contents for aggregator restart recovery -message DumpSnapshotsRequest {} -// Response reuses Snapshot; is_coarse distinguishes fine (1-min) from coarse (5-min) buckets. -// Stream closes after all historical data is sent (unlike StreamSnapshots which stays open). - -service LogtailService { - rpc TopN(TopNRequest) returns (TopNResponse); - rpc Trend(TrendRequest) returns (TrendResponse); - rpc StreamSnapshots(SnapshotRequest) returns (stream Snapshot); - rpc ListTargets(ListTargetsRequest) returns (ListTargetsResponse); - rpc DumpSnapshots(DumpSnapshotsRequest) returns (stream Snapshot); -} -// Both collector and aggregator implement LogtailService. -// The aggregator's StreamSnapshots re-streams the merged view. -// ListTargets: aggregator returns all configured collectors; collector returns itself. -// DumpSnapshots: collector only; aggregator calls this on startup to backfill its ring. -``` - -## Program 1 — Collector - -### tailer.go -- **`MultiTailer`**: one shared `fsnotify.Watcher` for all files regardless of count — avoids - the inotify instance limit when tailing hundreds of files. -- On `WRITE` event: read all new lines from that file's `bufio.Reader`. -- On `RENAME`/`REMOVE` (logrotate): drain old fd to EOF, close, start retry-open goroutine with - exponential backoff. Sends the new `*os.File` back via a channel to keep map access single-threaded. 
-- Emits `LogRecord` structs on a shared buffered channel (capacity 200 K — absorbs ~20 s of peak). -- Accepts paths via `--logs` (comma-separated or glob) and `--logs-file` (one path/glob per line). - -### parser.go -- Parses the fixed **logtail** nginx log format — tab-separated, fixed field order, no quoting: - - ```nginx - log_format logtail '$host\t$remote_addr\t$msec\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn'; - ``` - - | # | Field | Used for | - |---|-------------------|------------------| - | 0 | `$host` | website | - | 1 | `$remote_addr` | client_prefix | - | 2 | `$msec` | (discarded) | - | 3 | `$request_method` | (discarded) | - | 4 | `$request_uri` | http_request_uri | - | 5 | `$status` | http_response | - | 6 | `$body_bytes_sent`| (discarded) | - | 7 | `$request_time` | (discarded) | - | 8 | `$is_tor` | is_tor | - | 9 | `$asn` | asn | - -- `strings.SplitN(line, "\t", 10)` — ~50 ns/line. No regex. -- `$request_uri`: query string discarded at first `?`. -- `$remote_addr`: truncated to /24 (IPv4) or /48 (IPv6); prefix lengths configurable via flags. -- `$is_tor`: `1` if the client IP is a TOR exit node, `0` otherwise. Field is optional — lines - with exactly 8 fields (old format) are accepted and default to `is_tor=false`. -- `$asn`: client AS number as a decimal integer (from MaxMind GeoIP2). Field is optional — - lines without it default to `asn=0`. -- Lines with fewer than 8 fields are silently skipped. - -### store.go -- **Single aggregator goroutine** reads from the channel and updates the live map — no locking on - the hot path. At 10 K lines/s the goroutine uses <1% CPU. -- Live map: `map[Tuple6]int64`, hard-capped at 100 K entries (new keys dropped when full). -- **Minute ticker**: heap-selects top-50K entries, writes snapshot to fine ring, resets live map. -- Every 5 fine ticks: merge last 5 fine snapshots → top-5K → write to coarse ring. -- **TopN query**: RLock ring, sum bucket range, apply filter, group by dimension, heap-select top N. -- **Trend query**: per-bucket filtered sum, returns one `TrendPoint` per bucket. -- **Subscriber fan-out**: per-subscriber buffered channel; `Subscribe`/`Unsubscribe` for streaming. -- **`DumpRings()`**: acquires `RLock`, copies both ring arrays and their head/filled pointers - (just slice headers — microseconds), releases lock, then returns chronologically-ordered fine - and coarse snapshot slices. The lock is never held during serialisation or network I/O. - -### server.go -- gRPC server on configurable port (default `:9090`). -- `TopN` and `Trend`: unary, answered from the ring buffer under RLock. -- `StreamSnapshots`: registers a subscriber channel; loops `Recv` on it; 30 s keepalive ticker. -- `DumpSnapshots`: calls `DumpRings()`, streams all fine buckets (`is_coarse=false`) then all - coarse buckets (`is_coarse=true`), then closes the stream. No lock held during streaming. - -## Program 2 — Aggregator - -### subscriber.go -- One goroutine per collector. Dials, calls `StreamSnapshots`, forwards each `Snapshot` to the - merger. -- Reconnects with exponential backoff (100 ms → doubles → cap 30 s). -- After 3 consecutive failures: calls `merger.Zero(addr)` to remove that collector's contribution - from the merged view (prevents stale counts accumulating during outages). -- Resets failure count on first successful `Recv`; logs recovery. 
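The reconnect lifecycle above is mechanical enough that a short sketch may help. The following is a minimal, illustrative rendering of one subscription goroutine: the `CollectorSub` field names and the `Apply`/`Zero` calls come from this document, while the `Merger` interface, logging, and exact control flow are assumptions rather than the repository's code.

```go
package aggregator

import (
	"context"
	"log"
	"time"

	pb "git.ipng.ch/ipng/nginx-logtail/proto/logtailpb"
)

// Merger is the subset of the merger API a subscription goroutine needs;
// the real code uses a concrete merger type.
type Merger interface {
	Apply(snap *pb.Snapshot)
	Zero(addr string)
}

// CollectorSub holds the per-collector subscription state (addr, merger, fails).
type CollectorSub struct {
	addr   string
	merger Merger
	fails  int
}

// run assumes the gRPC dial has already happened; client is the stub for one
// collector. Sketch only: dial options and backoff jitter are omitted.
func (s *CollectorSub) run(ctx context.Context, client pb.LogtailServiceClient) {
	backoff := 100 * time.Millisecond
	for ctx.Err() == nil {
		stream, err := client.StreamSnapshots(ctx, &pb.SnapshotRequest{})
		if err == nil {
			for {
				snap, rerr := stream.Recv()
				if rerr != nil {
					err = rerr
					break
				}
				if s.fails >= 3 {
					log.Printf("subscriber %s: recovered", s.addr)
				}
				s.fails = 0
				backoff = 100 * time.Millisecond
				s.merger.Apply(snap)
			}
		}
		s.fails++
		if s.fails == 3 {
			// Three consecutive failures: stop counting this collector's
			// stale contribution in the merged view.
			log.Printf("subscriber %s: degraded: %v", s.addr, err)
			s.merger.Zero(s.addr)
		}
		select {
		case <-ctx.Done():
			return
		case <-time.After(backoff):
		}
		if backoff *= 2; backoff > 30*time.Second {
			backoff = 30 * time.Second
		}
	}
}
```

Resetting the backoff on a successful `Recv` (an assumption of this sketch) keeps a flapping collector from being penalised long after it recovers.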
- -### merger.go -- **Delta strategy**: on each new snapshot from collector X, subtract X's previous entries from - `merged`, add the new entries, store new map. O(snapshot_size) per update — not - O(N_collectors × snapshot_size). -- `Zero(addr)`: subtracts the collector's last-known contribution and deletes its entry — called - when a collector is marked degraded. - -### cache.go -- **Tick-based rotation** (1-min ticker, not snapshot-triggered): keeps the aggregator ring aligned - to the same 1-minute cadence as collectors regardless of how many collectors are connected. -- Same tiered ring structure as the collector store; populated from `merger.TopK()` each tick. -- `QueryTopN`, `QueryTrend`, `Subscribe`/`Unsubscribe` — identical interface to collector store. -- **`LoadHistorical(fine, coarse []Snapshot)`**: writes pre-merged backfill snapshots directly into - the ring arrays under `mu.Lock()`, sets head and filled counters, then returns. Safe to call - concurrently with queries. The live ticker continues from the updated head after this returns. - -### backfill.go -- **`Backfill(ctx, collectorAddrs, cache)`**: called once at aggregator startup (in a goroutine, - after the gRPC server is already listening so the frontend is never blocked). -- Dials all collectors concurrently and calls `DumpSnapshots` on each. -- Accumulates entries per timestamp in `map[unix-second]map[label]count`; multiple collectors' - contributions for the same bucket are summed — the same delta-merge semantics as the live path. -- Sorts timestamps chronologically, runs `TopKFromMap` per bucket, caps to ring size. -- Calls `cache.LoadHistorical` once with the merged results. -- **Graceful degradation**: if a collector returns `Unimplemented` (old binary without - `DumpSnapshots`), logs an informational message and skips it — live streaming still starts - normally. Any other error is logged with timing and also skipped. Partial backfill (some - collectors succeed, some fail) is supported. -- Logs per-collector stats: bucket counts, total entry counts, and wall-clock duration. - -### registry.go -- **`TargetRegistry`**: `sync.RWMutex`-protected `map[addr → name]`. Initialised with the - configured collector addresses; display names are updated from the `source` field of the first - snapshot received from each collector. -- `Targets()` returns a stable sorted slice of `{name, addr}` pairs for `ListTargets` responses. - -### server.go -- Implements `LogtailService` backed by the cache (not live fan-out). -- `StreamSnapshots` re-streams merged fine snapshots; usable by a second-tier aggregator or - monitoring system. -- `ListTargets` returns the current `TargetRegistry` contents — all configured collectors with - their display names and gRPC addresses. - -## Program 3 — Frontend - -### handler.go -- All filter state in the **URL query string**: `w` (window), `by` (group_by), `f_website`, - `f_prefix`, `f_uri`, `f_status`, `f_website_re`, `f_uri_re`, `f_is_tor`, `f_asn`, `n`, `target`. No - server-side session — URLs are shareable and bookmarkable; multiple operators see independent views. -- **Filter expression box**: a `q=` parameter carries a mini filter language - (`status>=400 AND website~=gouda.* AND uri~=^/api/`). On submission the handler parses it - via `ParseFilterExpr` and redirects to the canonical URL with individual `f_*` params; `q=` - never appears in the final URL. Parse errors re-render the current page with an inline message. 
-- **Status expressions**: `f_status` accepts `200`, `!=200`, `>=400`, `<500`, etc. — parsed by - `store.ParseStatusExpr` into `(value, StatusOp)` for the filter protobuf. -- **ASN expressions**: `f_asn` accepts the same expression syntax (`12345`, `!=65000`, `>=1000`, - `<64512`, etc.) — also parsed by `store.ParseStatusExpr`, stored as `(asn_number, AsnOp)` in the - filter protobuf. -- **Regex filters**: `f_website_re` and `f_uri_re` hold RE2 patterns; compiled once per request - into `store.CompiledFilter` before the query-loop iteration. Invalid regexes match nothing. -- `TopN`, `Trend`, and `ListTargets` RPCs issued **concurrently** (all with a 5 s deadline); page - renders with whatever completes. Trend failure suppresses the sparkline; `ListTargets` failure - hides the source picker — both are non-fatal. -- **Source picker**: `ListTargets` result drives a `source:` tab row. Clicking a collector tab - sets `target=` to that collector's address, querying it directly. The "all" tab resets to the - default aggregator. Picker is hidden when `ListTargets` returns ≤0 collectors (direct collector - mode). -- **Drilldown**: clicking a table row adds the current dimension's filter and advances `by` through - `website → prefix → uri → status → asn → website` (cycles). -- **`raw=1`**: returns the TopN result as JSON — same URL, no CLI needed for scripting. -- **`target=` override**: per-request gRPC endpoint override for comparing sources. -- Error pages render at HTTP 502 with the window/group-by tabs still functional. - -### sparkline.go -- `renderSparkline([]*pb.TrendPoint) template.HTML` — fixed `viewBox="0 0 300 60"` SVG, - Y-scaled to max count, rendered as ``. Returns `""` for fewer than 2 points or - all-zero data. - -### templates/ -- `base.html`: outer shell, inline CSS (~40 lines), conditional ``. -- `index.html`: window tabs, group-by tabs, filter breadcrumb with `×` remove links, sparkline, - TopN table with `` bars (% relative to rank-1), footer with source and refresh info. -- No external CSS, no web fonts, no JavaScript. Renders in w3m/lynx. - -## Program 4 — CLI - -### Subcommands - -``` -logtail-cli topn [flags] ranked label → count table (exits after one response) -logtail-cli trend [flags] per-bucket time series (exits after one response) -logtail-cli stream [flags] live snapshot feed (runs until Ctrl-C, auto-reconnects) -logtail-cli targets [flags] list targets known to the queried endpoint -``` - -### Flags - -**Shared** (all subcommands): - -| Flag | Default | Description | -|---------------|------------------|----------------------------------------------------------| -| `--target` | `localhost:9090` | Comma-separated `host:port` list; fan-out to all | -| `--json` | false | Emit newline-delimited JSON instead of a table | -| `--website` | — | Filter: website | -| `--prefix` | — | Filter: client prefix | -| `--uri` | — | Filter: request URI | -| `--status` | — | Filter: HTTP status expression (`200`, `!=200`, `>=400`, `<500`, …) | -| `--website-re`| — | Filter: RE2 regex against website | -| `--uri-re` | — | Filter: RE2 regex against request URI | -| `--is-tor` | — | Filter: TOR traffic (`1` or `!=0` = TOR only; `0` or `!=1` = non-TOR only) | -| `--asn` | — | Filter: ASN expression (`12345`, `!=65000`, `>=1000`, `<64512`, …) | - -**`topn` only**: `--n 10`, `--window 5m`, `--group-by website` - -**`trend` only**: `--window 5m` - -### Multi-target fan-out - -`--target` accepts a comma-separated list. 
All targets are queried concurrently; results are -printed in order with a per-target header. Single-target output omits the header for clean -pipe-to-`jq` use. - -### Output - -Default: human-readable table with space-separated thousands (`18 432`). -`--json`: a single JSON array (one object per target) for `topn` and `trend`; NDJSON for `stream` (unbounded). - -`stream` reconnects automatically on error (5 s backoff). All other subcommands exit immediately -with a non-zero code on gRPC error. - -## Key Design Decisions - -| Decision | Rationale | -|----------|-----------| -| Single aggregator goroutine in collector | Eliminates all map lock contention on the 10 K/s hot path | -| Hard cap live map at 100 K entries | Bounds memory regardless of DDoS cardinality explosion | -| Ring buffer of sorted snapshots (not raw maps) | TopN queries avoid re-sorting; merge is a single heap pass | -| Push-based streaming (collector → aggregator) | Aggregator cache always fresh; query latency is cache-read only | -| Delta merge in aggregator | O(snapshot_size) per update, not O(N_collectors × size) | -| Tick-based cache rotation in aggregator | Ring stays on the same 1-min cadence regardless of collector count | -| Degraded collector zeroing | Stale counts from failed collectors don't accumulate in the merged view | -| Same `LogtailService` for collector and aggregator | CLI and frontend work with either; no special-casing | -| `internal/store` shared package | ring-buffer, `Tuple6` encoding, and filter logic shared between collector and aggregator | -| Filter state in URL, not session cookie | Multiple concurrent operators; shareable/bookmarkable URLs | -| Query strings stripped at ingest | Major cardinality reduction; prevents URI explosion under attack | -| No persistent storage | Simplicity; acceptable for ops dashboards (restart = lose history) | -| Trusted internal network, no TLS | Reduces operational complexity; add a TLS proxy if needed later | -| Server-side SVG sparklines, meta-refresh | Zero JS dependencies; works in terminal browsers and curl | -| CLI default: human-readable table | Operator-friendly by default; `--json` opt-in for scripting | -| CLI multi-target fan-out | Compare a collector vs. aggregator, or two collectors, in one command | -| CLI uses stdlib `flag`, no framework | Four subcommands don't justify a dependency | -| Status filter as expression string (`!=200`, `>=400`) | Operator-friendly; parsed once at query boundary, encoded as `(int32, StatusOp)` in proto | -| ASN filter reuses `StatusOp` and `ParseStatusExpr` | Same 6-operator grammar as status; no duplicate enum or parser needed | -| Regex filters compiled once per query (`CompiledFilter`) | Up to 288 × 5 000 per-entry calls — compiling per-entry would dominate query latency | -| Filter expression box (`q=`) redirects to canonical URL | Filter state stays in individual `f_*` params; URLs remain shareable and bookmarkable | -| `ListTargets` + frontend source picker | "Which nginx is busiest?" 
answered by switching `target=` to a collector; no data model changes, no extra memory | -| Backfill via `DumpSnapshots` on restart | Aggregator recovers full 24h ring from collectors on restart; gRPC server starts first so frontend is never blocked during backfill | -| `DumpRings()` copies under lock, streams without lock | Lock held for microseconds (slice-header copy only); network I/O happens outside the lock so minute rotation is never delayed | -| Backfill merges per-timestamp across collectors | Correct cross-collector sums per bucket, same semantics as live delta-merge; collectors that don't support `DumpSnapshots` are skipped gracefully | diff --git a/docs/PLAN_AGGREGATOR.md b/docs/PLAN_AGGREGATOR.md deleted file mode 100644 index 0ae85db..0000000 --- a/docs/PLAN_AGGREGATOR.md +++ /dev/null @@ -1,250 +0,0 @@ -# Aggregator v0 — Implementation Plan - -Module path: `git.ipng.ch/ipng/nginx-logtail` - -**Scope:** A working aggregator that subscribes to `StreamSnapshots` from all configured -collectors, maintains a merged in-memory cache, and serves the same `LogtailService` gRPC -interface as the collector. Tolerates partial collector failures. - ---- - -## Step 1 — Extract shared logic to `internal/store` - -The aggregator's cache is structurally identical to the collector's store: same `Entry` and -`snapshot` types, same tiered ring buffers, same heap-based top-K, same label encoding -(`encodeTuple`, `labelTuple`), same `matchesFilter` and `dimensionLabel` functions. - -Rather than duplicating ~200 lines of load-bearing code, extract these to a shared internal -package before writing any aggregator code. Then refactor the collector to import it. - -**New package: `internal/store`** - -Move from `cmd/collector/store.go` into `internal/store/store.go`: -- `Tuple4` struct -- `Entry` struct -- `snapshot` struct (unexported → exported: `Snapshot`) -- `entryHeap` + heap interface methods -- `encodeTuple`, `labelTuple`, `splitN`, `indexOf` -- `matchesFilter`, `dimensionLabel` -- `topKFromMap`, `topK` -- `trendPoint` -- `ringView`, `bucketsForWindow` -- All ring-buffer constants (`fineRingSize`, `coarseRingSize`, `fineTopK`, `coarseTopK`, - `coarseEvery`) - -Keep in `cmd/collector/store.go` (collector-specific): -- `liveMapCap` -- `Store` struct (live map + ring buffers + subscriber fan-out + `Run` goroutine) -- `ingest`, `rotate`, `mergeFineBuckets` -- `QueryTopN`, `QueryTrend`, `Subscribe`, `Unsubscribe`, `broadcast` -- The `Store` embeds the ring buffers using the types from `internal/store` - -Collector tests must continue to pass unchanged after the refactor. - ---- - -## Step 2 — subscriber.go - -One goroutine per collector. Dials the collector, calls `StreamSnapshots`, and forwards each -received `pb.Snapshot` to the merger. Reconnects with exponential backoff on any stream error. - -``` -CollectorSub struct: - addr string - merger *Merger - source string // filled from first snapshot received - fails int // consecutive failures -``` - -Lifecycle: -1. `Dial(addr)` → `client.StreamSnapshots(ctx, &pb.SnapshotRequest{})` -2. Loop: `stream.Recv()` → `merger.Apply(snap)`; on error: close, `fails++` -3. If `fails >= 3`: call `merger.Zero(addr)`, log degraded warning -4. Backoff sleep (100 ms → doubles → cap 30 s), then go to step 1 -5. On successful `Recv()` after degraded: `fails = 0`, log recovery - -Context cancellation exits the goroutine cleanly. - ---- - -## Step 3 — merger.go - -Maintains the per-collector maps and a single running merged map. 
Uses a delta strategy: -when a new snapshot arrives from collector X, subtract X's previous entries from `merged`, -add the new entries, and replace X's stored map. This is O(snapshot_size) rather than -O(N_collectors × snapshot_size). - -``` -Merger struct: - mu sync.Mutex - perCollector map[string]map[string]int64 // addr → (label → count) - merged map[string]int64 // label → total count across all collectors -``` - -Methods: -- `Apply(snap *pb.Snapshot)` — lock, subtract old, add new, store new, unlock -- `Zero(addr string)` — lock, subtract perCollector[addr] from merged, delete entry, unlock -- `TopK(k int) []store.Entry` — lock, call `store.TopKFromMap(merged, k)`, unlock - -`Apply` is called from multiple subscriber goroutines concurrently — the mutex is the only -synchronisation point. No channels needed here. - ---- - -## Step 4 — cache.go - -The aggregator's equivalent of the collector's `Store`. Holds the tiered ring buffers and -answers `TopN`/`Trend`/`StreamSnapshots` queries. Populated by a 1-minute ticker that snapshots -the current merged view from the merger. - -``` -Cache struct: - source string - merger *Merger - - mu sync.RWMutex - fineRing [fineRingSize]store.Snapshot - fineHead int - fineFilled int - coarseRing [coarseRingSize]store.Snapshot - coarseHead int - coarseFilled int - fineTick int - - subMu sync.Mutex - subs map[chan store.Snapshot]struct{} -``` - -`Run(ctx context.Context)`: -- 1-minute ticker → `rotate(time.Now())` -- `rotate`: `merger.TopK(fineTopK)` → fine ring slot; every 5 ticks → merge last 5 fine slots - into coarse ring slot (identical logic to collector `Store.rotate`) -- After writing: broadcast fine snapshot to subscribers - -`QueryTopN`, `QueryTrend`, `Subscribe`, `Unsubscribe`, `broadcast`: identical to collector -`Store`, backed by `internal/store` helpers. - -**Why tick-based and not snapshot-triggered?** -Collectors send snapshots roughly once per minute but not in sync. Triggering a ring write on -every incoming snapshot would produce N writes per minute (one per collector), inflating the ring -and misaligning time windows. A single ticker keeps the aggregator ring aligned with the same -1-minute cadence as the collectors. - ---- - -## Step 5 — server.go - -Identical structure to `cmd/collector/server.go`. Implements `pb.LogtailServiceServer` backed by -the `Cache` instead of the collector's `Store`. No new logic; just a different backing type. - -`StreamSnapshots` sends merged fine snapshots (from `cache.Subscribe`) to downstream consumers -(frontend, CLI, or a second-tier aggregator). - ---- - -## Step 6 — main.go - -Flags: - -| Flag | Default | Description | -|----------------|--------------|--------------------------------------------------------| -| `--listen` | `:9091` | gRPC listen address | -| `--collectors` | — | Comma-separated `host:port` addresses of collectors | -| `--source` | hostname | Name for this aggregator in query responses | - -Wire-up: -1. Parse collector addresses -2. Create `Merger` -3. Create `Cache(merger, source)` -4. Start `cache.Run(ctx)` goroutine (ticker + ring rotation) -5. Start one `CollectorSub.Run(ctx)` goroutine per collector address -6. Start gRPC server -7. 
`signal.NotifyContext` for clean shutdown on SIGINT/SIGTERM - ---- - -## Step 7 — Tests - -| Test | What it covers | -|------|----------------| -| `TestMergerApply` | Two collectors send snapshots; merged map sums correctly | -| `TestMergerReplacement` | Second snapshot from same collector replaces first, not adds | -| `TestMergerZero` | Marking a collector degraded removes its contribution from merged | -| `TestMergerConcurrent` | `Apply` and `Zero` from concurrent goroutines; no race (run with `-race`) | -| `TestCacheRotation` | After one ticker fire, fine ring has 1 entry with correct counts | -| `TestCacheCoarseRing` | After 5 ticker fires, coarse ring has 1 entry | -| `TestCacheQueryTopN` | TopN returns correct merged rankings | -| `TestCacheQueryTrend` | Trend returns per-bucket sums oldest-first | -| `TestCacheSubscribe` | Subscriber receives snapshot after each rotation | -| `TestGRPCEndToEnd` | Two in-process fake collector servers; real aggregator dials them; TopN, Trend, StreamSnapshots verified | - -All existing collector tests must continue to pass after the `internal/store` refactor. - ---- - -## Step 8 — Smoke test - -- Start two collector instances pointing at generated log files -- Start the aggregator pointing at both -- Use `grpcurl` to call `TopN` on the aggregator and confirm counts match the sum of the two - individual collector `TopN` results -- Kill one collector; confirm the aggregator continues serving and logs a degraded warning -- Restart the killed collector; confirm the aggregator recovers and resumes merging - ---- - -## ✓ COMPLETE — Implementation notes - -### Deviations from the plan - -- **`TestMergerZeroNonexistent` added**: Plan listed 10 tests; an extra test was added to cover - `Zero()` on a source that never sent a snapshot (should be a no-op). Total: 13 tests. -- **`TestDegradedCollector` in end-to-end section**: Rather than a separate block, degraded - behaviour is tested with one real fake collector + one unreachable port in the same test file. -- **Race in `TestGRPCEndToEnd`**: The `cache.rotate()` call to trigger a broadcast needed a - 50 ms sleep after `client.StreamSnapshots()` to allow the server goroutine to register its - subscriber before the broadcast fired. Without it the test was intermittently flaky under - the race detector and parallel test runs. -- **`source` field not stored on `CollectorSub`**: Plan mentioned storing `source` from the first - snapshot, but `Apply` uses `snap.Source` directly (keying `perCollector` by address). The - `source` field was not needed on the struct. - -### Test results - -``` -$ go test ./... -count=1 -race -timeout 60s -ok git.ipng.ch/ipng/nginx-logtail/cmd/aggregator 4.1s -ok git.ipng.ch/ipng/nginx-logtail/cmd/collector 9.7s -``` - -All 13 aggregator tests and all 17 collector tests pass with `-race`. 
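The merge logic those tests exercise is compact. A simplified sketch of the Step 3 `Apply`/`Zero` pair, using plain `label → count` maps rather than the real `internal/store` types and `pb.Snapshot` (names and signatures here are illustrative only):

```go
// Simplified delta-merge sketch; the real Merger keys by collector address
// and converts pb.Snapshot entries into these maps before applying.
type Merger struct {
	mu           sync.Mutex
	perCollector map[string]map[string]int64 // addr → (label → count)
	merged       map[string]int64            // label → total across all collectors
}

func NewMerger() *Merger {
	return &Merger{
		perCollector: make(map[string]map[string]int64),
		merged:       make(map[string]int64),
	}
}

// Apply replaces collector addr's contribution: subtract its previous
// entries, add the new ones — O(snapshot_size) regardless of collector count.
func (m *Merger) Apply(addr string, entries map[string]int64) {
	m.mu.Lock()
	defer m.mu.Unlock()
	for label, c := range m.perCollector[addr] {
		m.merged[label] -= c
		if m.merged[label] == 0 {
			delete(m.merged, label)
		}
	}
	for label, c := range entries {
		m.merged[label] += c
	}
	m.perCollector[addr] = entries
}

// Zero removes a degraded collector's contribution entirely.
func (m *Merger) Zero(addr string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	for label, c := range m.perCollector[addr] {
		m.merged[label] -= c
		if m.merged[label] == 0 {
			delete(m.merged, label)
		}
	}
	delete(m.perCollector, addr)
}
```

Deleting keys that reach zero keeps `merged` from accumulating dead labels as collectors churn or go degraded.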
- -### Test inventory - -| Test | Package | What it covers | -|------|---------|----------------| -| `TestMergerApply` | aggregator | Two collectors sum correctly | -| `TestMergerReplacement` | aggregator | Second snapshot replaces, not adds | -| `TestMergerZero` | aggregator | Degraded collector removed from merged | -| `TestMergerZeroNonexistent` | aggregator | Zero on unknown source is a no-op | -| `TestMergerConcurrent` | aggregator | Apply + Zero from concurrent goroutines; -race | -| `TestCacheRotation` | aggregator | Fine ring written after one ticker fire | -| `TestCacheCoarseRing` | aggregator | Coarse ring written after 5 ticker fires | -| `TestCacheQueryTopN` | aggregator | TopN returns correct merged rankings | -| `TestCacheQueryTopNWithFilter` | aggregator | TopN with website filter | -| `TestCacheQueryTrend` | aggregator | Trend per-bucket sums oldest-first | -| `TestCacheSubscribe` | aggregator | Subscriber receives snapshot on rotation | -| `TestGRPCEndToEnd` | aggregator | Two fake collectors; real gRPC TopN/Trend/Stream | -| `TestDegradedCollector` | aggregator | Bad address zeroed; good collector still visible | - ---- - -## Deferred (not in v0) - -- Per-source (busiest nginx) breakdown — requires adding `SOURCE` to the `GroupBy` proto enum - and encoding the source into the merged snapshot entries; deferred until the proto is stable -- `cmd/cli` — covered in PLAN_CLI.md -- `cmd/frontend` — covered in PLAN_FRONTEND.md -- ClickHouse export -- TLS / auth -- Prometheus metrics endpoint diff --git a/docs/PLAN_CLI.md b/docs/PLAN_CLI.md deleted file mode 100644 index 9766a5b..0000000 --- a/docs/PLAN_CLI.md +++ /dev/null @@ -1,293 +0,0 @@ -# CLI v0 — Implementation Plan - -Module path: `git.ipng.ch/ipng/nginx-logtail` - -**Scope:** A shell-facing debug tool that can query any number of collectors or aggregators -(they share the same `LogtailService` gRPC interface) and print results in a human-readable -table or JSON. Supports all three RPCs: `TopN`, `Trend`, and `StreamSnapshots`. - ---- - -## Overview - -Single binary `logtail-cli` with three subcommands: - -``` -logtail-cli topn [flags] # ranked list of label → count -logtail-cli trend [flags] # per-bucket time series -logtail-cli stream [flags] # live snapshot feed -``` - -All subcommands accept one or more `--target` addresses. Requests are fanned out -concurrently; each target's results are printed under a labeled header. With a single -target the header is omitted for clean pipe-friendly output. - ---- - -## Step 1 — main.go and subcommand dispatch - -No third-party CLI frameworks — plain `os.Args` subcommand dispatch, each subcommand -registers its own `flag.FlagSet`. - -``` -main(): - if len(os.Args) < 2 → print usage, exit 1 - switch os.Args[1]: - "topn" → runTopN(os.Args[2:]) - "trend" → runTrend(os.Args[2:]) - "stream" → runStream(os.Args[2:]) - default → print usage, exit 1 -``` - -Usage text lists all subcommands and their flags. 
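A runnable sketch of that dispatch using only the standard library — the subcommand bodies are stubs here; the real ones register the flag sets described in Step 2:

```go
package main

import (
	"fmt"
	"os"
)

// Stubs standing in for the real subcommands; each registers its own
// flag.FlagSet (see Step 2) before doing any work.
func runTopN(args []string)   { fmt.Println("topn", args) }
func runTrend(args []string)  { fmt.Println("trend", args) }
func runStream(args []string) { fmt.Println("stream", args) }

func usage() {
	fmt.Fprintln(os.Stderr, "usage: logtail-cli <topn|trend|stream> [flags]")
}

func main() {
	if len(os.Args) < 2 {
		usage()
		os.Exit(1)
	}
	switch os.Args[1] {
	case "topn":
		runTopN(os.Args[2:])
	case "trend":
		runTrend(os.Args[2:])
	case "stream":
		runStream(os.Args[2:])
	default:
		usage()
		os.Exit(1)
	}
}
```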
- ---- - -## Step 2 — Shared flags and client helper (`flags.go`, `client.go`) - -**Shared flags** (parsed by each subcommand's FlagSet): - -| Flag | Default | Description | -|------|---------|-------------| -| `--target` | `localhost:9090` | Comma-separated `host:port` list (may be repeated) | -| `--json` | false | Emit newline-delimited JSON instead of a table | -| `--website` | — | Filter: exact website match | -| `--prefix` | — | Filter: exact client prefix match | -| `--uri` | — | Filter: exact URI match | -| `--status` | — | Filter: exact HTTP status match | - -`parseTargets(s string) []string` — split on comma, trim spaces, deduplicate. - -`buildFilter(flags) *pb.Filter` — returns nil if no filter flags set (signals "no filter" -to the server), otherwise populates the proto fields. - -**`client.go`**: - -```go -func dial(addr string) (*grpc.ClientConn, pb.LogtailServiceClient, error) -``` - -Plain insecure dial (matching the servers' plain-TCP listener). Returns an error rather -than calling `log.Fatal` so callers can report which target failed without killing the process. - ---- - -## Step 3 — `topn` subcommand (`cmd_topn.go`) - -Additional flags: - -| Flag | Default | Description | -|------|---------|-------------| -| `--n` | 10 | Number of entries to return | -| `--window` | `5m` | Time window: `1m 5m 15m 60m 6h 24h` | -| `--group-by` | `website` | Grouping: `website prefix uri status` | - -`parseWindow(s string) pb.Window` — maps string → proto enum, exits on unknown value. -`parseGroupBy(s string) pb.GroupBy` — same pattern. - -Fan-out: one goroutine per target, each calls `TopN` with a 10 s context deadline, -sends result (or error) on a typed result channel. Main goroutine collects all results -in target order. - -**Table output** (default): - -``` -=== collector-1 (localhost:9090) === -RANK COUNT LABEL - 1 18 432 example.com - 2 4 211 other.com - ... - -=== aggregator (localhost:9091) === -RANK COUNT LABEL - 1 22 643 example.com - ... -``` - -Single-target: header omitted, plain table printed. - -**JSON output** (`--json`): one JSON object per target, written sequentially to stdout: - -```json -{"source":"collector-1","target":"localhost:9090","entries":[{"label":"example.com","count":18432},...]} -``` - ---- - -## Step 4 — `trend` subcommand (`cmd_trend.go`) - -Additional flags: - -| Flag | Default | Description | -|------|---------|-------------| -| `--window` | `5m` | Time window: `1m 5m 15m 60m 6h 24h` | - -Same fan-out pattern as `topn`. - -**Table output**: - -``` -=== collector-1 (localhost:9090) === -TIME (UTC) COUNT -2026-03-14 20:00 823 -2026-03-14 20:01 941 -... -``` - -Points are printed oldest-first (as returned by the server). - -**JSON output**: one object per target: - -```json -{"source":"col-1","target":"localhost:9090","points":[{"ts":1773516000,"count":823},...] -``` - ---- - -## Step 5 — `stream` subcommand (`cmd_stream.go`) - -No extra flags beyond shared ones. Each target gets one persistent `StreamSnapshots` -connection. All streams are multiplexed onto a single output goroutine via an internal -channel so lines from different targets don't interleave. - -``` -type streamEvent struct { - target string - source string - snap *pb.Snapshot - err error -} -``` - -One goroutine per target: connect → loop `stream.Recv()` → send event on channel. -On error: log to stderr, attempt reconnect after 5 s backoff (indefinitely, until -`Ctrl-C`). - -`signal.NotifyContext` on SIGINT/SIGTERM cancels all stream goroutines. 
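A sketch of the per-target goroutine, reusing the `streamEvent` type above and the `dial` helper from Step 2 (`pb` is the generated proto package; error handling is trimmed, so treat the shape as illustrative):

```go
// Per-target loop: connect, forward each snapshot as an event, and reconnect
// with a fixed 5 s backoff until the context is cancelled.
func streamTarget(ctx context.Context, addr string, events chan<- streamEvent) {
	for {
		conn, client, err := dial(addr)
		if err == nil {
			stream, serr := client.StreamSnapshots(ctx, &pb.SnapshotRequest{})
			for serr == nil {
				var snap *pb.Snapshot
				if snap, serr = stream.Recv(); serr == nil {
					events <- streamEvent{target: addr, source: snap.GetSource(), snap: snap}
				}
			}
			conn.Close()
			err = serr
		}
		events <- streamEvent{target: addr, err: err} // printed to stderr by the output goroutine
		select {
		case <-ctx.Done():
			return
		case <-time.After(5 * time.Second): // reconnect backoff
		}
	}
}
```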
- -**Table output** (one line per snapshot received): - -``` -2026-03-14 20:03:00 agg-test (localhost:9091) 950 entries top: example.com=18432 -``` - -**JSON output**: one JSON object per snapshot event: - -```json -{"ts":1773516180,"source":"agg-test","target":"localhost:9091","top_label":"example.com","top_count":18432,"total_entries":950} -``` - ---- - -## Step 6 — Formatting helpers (`format.go`) - -```go -func printTable(w io.Writer, headers []string, rows [][]string) -``` - -Right-aligns numeric columns (COUNT, RANK), left-aligns strings. Uses `text/tabwriter` -with padding=2. No external dependencies. - -```go -func fmtCount(n int64) string // "18 432" — space as thousands separator -func fmtTime(unix int64) string // "2026-03-14 20:03" UTC -``` - ---- - -## Step 7 — Tests (`cli_test.go`) - -Unit tests run entirely in-process with fake gRPC servers (same pattern as -`cmd/aggregator/aggregator_test.go`). - -| Test | What it covers | -|------|----------------| -| `TestParseWindow` | All 6 window strings → correct proto enum; bad value exits | -| `TestParseGroupBy` | All 4 group-by strings → correct proto enum; bad value exits | -| `TestParseTargets` | Comma split, trim, dedup | -| `TestBuildFilter` | All combinations of filter flags → correct proto Filter | -| `TestTopNSingleTarget` | Fake server; `runTopN` output matches expected table | -| `TestTopNMultiTarget` | Two fake servers; both headers present in output | -| `TestTopNJSON` | `--json` flag; output is valid JSON with correct fields | -| `TestTrendSingleTarget` | Fake server; points printed oldest-first | -| `TestTrendJSON` | `--json` flag; output is valid JSON | -| `TestStreamReceivesSnapshots` | Fake server sends 3 snapshots; output has 3 lines | -| `TestFmtCount` | `fmtCount(18432)` → `"18 432"` | -| `TestFmtTime` | `fmtTime(1773516000)` → `"2026-03-14 20:00"` | - ---- - -## ✓ COMPLETE — Implementation notes - -### Deviations from the plan - -- **`TestFmtTime` uses `time.Date` not a hardcoded unix literal**: The hardcoded value - `1773516000` turned out to be 2026-03-14 19:20 UTC, not 20:00. Fixed by computing the - timestamp dynamically with `time.Date(2026, 3, 14, 20, 0, 0, 0, time.UTC).Unix()`. -- **`TestTopNJSON` tests field values, not serialised bytes**: Calling `printTopNJSON` would - require redirecting stdout. Instead the test verifies the response struct fields that the - JSON formatter would use — simpler and equally effective. -- **`streamTarget` reconnect loop lives in `cmd_stream.go`**, not a separate file. The stream - and reconnect logic are short enough to colocate. - -### Test results - -``` -$ go test ./... 
-count=1 -race -timeout 60s -ok git.ipng.ch/ipng/nginx-logtail/cmd/cli 1.0s (14 tests) -ok git.ipng.ch/ipng/nginx-logtail/cmd/aggregator 4.1s (13 tests) -ok git.ipng.ch/ipng/nginx-logtail/cmd/collector 9.9s (17 tests) -``` - -### Test inventory - -| Test | What it covers | -|------|----------------| -| `TestParseTargets` | Comma split, trim, deduplication | -| `TestParseWindow` | All 6 window strings → correct proto enum | -| `TestParseGroupBy` | All 4 group-by strings → correct proto enum | -| `TestBuildFilter` | Filter fields set correctly from flags | -| `TestBuildFilterNil` | Returns nil when no filter flags set | -| `TestFmtCount` | Space-separated thousands: 1234567 → "1 234 567" | -| `TestFmtTime` | Unix → "2026-03-14 20:00" UTC | -| `TestTopNSingleTarget` | Fake server; correct entry count and top label | -| `TestTopNMultiTarget` | Two fake servers; results ordered by target | -| `TestTopNJSON` | Response fields match expected values for JSON | -| `TestTrendSingleTarget` | Correct point count and ascending timestamp order | -| `TestTrendJSON` | JSON round-trip preserves source, ts, count | -| `TestStreamReceivesSnapshots` | 3 snapshots delivered from fake server via events channel | -| `TestTargetHeader` | Single-target → empty; multi-target → labeled header | - ---- - -## Step 8 — Smoke test - -```bash -# Start a collector -./logtail-collector --listen :9090 --logs /var/log/nginx/access.log - -# Start an aggregator -./logtail-aggregator --listen :9091 --collectors localhost:9090 - -# Query TopN from both in one shot -./logtail-cli topn --target localhost:9090,localhost:9091 --window 15m --n 5 - -# Stream live snapshots from both simultaneously -./logtail-cli stream --target localhost:9090,localhost:9091 - -# Filter to one website, group by URI -./logtail-cli topn --target localhost:9091 --website example.com --group-by uri --n 20 - -# JSON output for scripting -./logtail-cli topn --target localhost:9091 --json | jq '.entries[0]' -``` - ---- - -## Deferred (not in v0) - -- `--format csv` — easy to add later if needed for spreadsheet export -- `--count` / `--watch N` — repeat the query every N seconds (like `watch(1)`) -- Color output (`--color`) — ANSI highlighting of top entries -- Connecting to TLS-secured endpoints (when TLS is added to the servers) -- Per-source breakdown (depends on `SOURCE` GroupBy being added to the proto) diff --git a/docs/PLAN_COLLECTOR.md b/docs/PLAN_COLLECTOR.md deleted file mode 100644 index 9a83a56..0000000 --- a/docs/PLAN_COLLECTOR.md +++ /dev/null @@ -1,144 +0,0 @@ -# Collector v0 — Implementation Plan ✓ COMPLETE - -Module path: `git.ipng.ch/ipng/nginx-logtail` - -**Scope:** A working collector that tails files, aggregates into memory, and serves `TopN`, -`Trend`, and `StreamSnapshots` over gRPC. Full vertical slice, no optimisation passes yet. - ---- - -## Step 1 — Repo scaffolding -- `go mod init git.ipng.ch/ipng/nginx-logtail` -- `.gitignore` -- Install deps: `google.golang.org/grpc`, `google.golang.org/protobuf`, `github.com/fsnotify/fsnotify` - -## Step 2 — Proto (`proto/logtail.proto`) -Write the full proto file as specified in README.md DESIGN § Protobuf API. Generate Go stubs with -`protoc`. Commit generated files. This defines the contract everything else builds on. 
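Once the stubs exist, every server-side binary registers an implementation of the generated interface. A minimal wiring sketch — the struct and its fields are illustrative, not the final collector code:

```go
// Illustrative only: protoc-gen-go-grpc generates LogtailServiceServer,
// UnimplementedLogtailServiceServer, and RegisterLogtailServiceServer.
type server struct {
	pb.UnimplementedLogtailServiceServer
	// store *Store — wired up in Steps 4 and 6
}

func serve(listen string) error {
	lis, err := net.Listen("tcp", listen)
	if err != nil {
		return err
	}
	s := grpc.NewServer()
	pb.RegisterLogtailServiceServer(s, &server{})
	return s.Serve(lis)
}
```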
- -## Step 3 — Parser (`cmd/collector/parser.go`) -- `LogRecord` struct: `Website`, `ClientPrefix`, `URI`, `Status string` -- `ParseLine(line string) (LogRecord, bool)` — `SplitN` on tab, discard query string at `?`, - return `false` for lines with fewer than 8 fields -- `TruncateIP(addr string, v4bits, v6bits int) string` — handle IPv4 and IPv6 -- Unit-tested with table-driven tests: normal line, short line, IPv6, query string stripping, - /24 and /48 truncation - -## Step 4 — Store (`cmd/collector/store.go`) -Implement in order, each piece testable independently: - -1. **`Tuple4` and live map** — `map[Tuple4]int64`, cap enforcement at 100K, `Ingest(r LogRecord)` -2. **Fine ring buffer** — `[60]Snapshot` circular array, `rotate()` heap-selects top-50K from - live map, appends to ring, resets live map -3. **Coarse ring buffer** — `[288]Snapshot`, populated every 5 fine rotations by merging - the last 5 fine snapshots into a top-5K snapshot -4. **`QueryTopN(filter, groupBy, n, window)`** — RLock, sum bucket range, group by dimension, - apply filter, heap-select top N -5. **`QueryTrend(filter, window)`** — per-bucket count sum, returns one point per bucket -6. **`Store.Run(ch <-chan LogRecord)`** — single goroutine: read channel → `Ingest`, minute - ticker → `rotate()` -7. **Snapshot broadcast** — per-subscriber buffered channel fan-out; - `Subscribe() <-chan Snapshot` / `Unsubscribe(ch)` - -## Step 5 — Tailer (`cmd/collector/tailer.go`) -- `Tailer` struct: path, fsnotify watcher, output channel -- On start: open file, seek to EOF, register fsnotify watch -- On `fsnotify.Write`: `bufio.Scanner` reads all new lines, sends `LogRecord` to channel -- On `fsnotify.Rename` / `Remove`: drain to EOF, close fd, retry open with 100 ms backoff - (up to 5 s), resume from position 0 — no lines lost between drain and reopen -- `Tailer.Run(ctx context.Context)` — blocks until context cancelled - -## Step 6 — gRPC server (`cmd/collector/server.go`) -- `Server` wraps `*Store`, implements `LogtailServiceServer` -- `TopN`: `store.QueryTopN` → marshal to proto response -- `Trend`: `store.QueryTrend` → marshal to proto response -- `StreamSnapshots`: `store.Subscribe()`, loop sending snapshots until client disconnects - or context done, then `store.Unsubscribe(ch)` - -## Step 7 — Main (`cmd/collector/main.go`) -Flags: -- `--listen` default `:9090` -- `--logs` comma-separated log file paths -- `--source` name for this collector instance (default: hostname) -- `--v4prefix` default `24` -- `--v6prefix` default `48` - -Wire-up: create channel → start `store.Run` goroutine → start one `Tailer` goroutine per log -path → start gRPC server → `signal.NotifyContext` for clean shutdown on SIGINT/SIGTERM. - -## Step 8 — Smoke test -- Generate fake log lines at 10K/s (small Go script or shell one-liner) -- Run collector against them -- Use `grpcurl` to call `TopN` and verify results -- Check `runtime.MemStats` to confirm memory stays well under 1 GB - ---- - -## Deferred (not in v0) -- `cmd/cli`, `cmd/aggregator`, `cmd/frontend` -- ClickHouse export -- TLS / auth -- Prometheus metrics endpoint - ---- - -## Implementation notes - -### Deviation from plan: MultiTailer - -Step 5 planned one `Tailer` struct per file. During implementation this was changed to a single -`MultiTailer` with one shared `fsnotify.Watcher`. Reason: one watcher per file creates one inotify -instance per file; the kernel default limit is 128 instances per user, which would be hit with -100s of log files. 
The `MultiTailer` uses a single instance and routes events by path via a -`map[string]*fileState`. - -### Deviation from plan: IPv6 /48 semantics - -The design doc said "truncate to /48". `/48` keeps the first three full 16-bit groups intact -(e.g. `2001:db8:cafe::1` → `2001:db8:cafe::/48`). An early test expected `2001:db8:ca00::/48` -(truncating mid-group), which was wrong. The code is correct; the test was fixed. - ---- - -## Test results - -Run with: `go test ./cmd/collector/ -v -count=1 -timeout 120s` - -| Test | What it covers | -|-----------------------------|----------------------------------------------------| -| `TestParseLine` (7 cases) | Tab parsing, query string stripping, bad lines | -| `TestTruncateIP` | IPv4 /24 and IPv6 /48 masking | -| `TestIngestAndRotate` | Live map → fine ring rotation | -| `TestLiveMapCap` | Hard cap at 100 K entries, no panic beyond cap | -| `TestQueryTopN` | Ranked results from ring buffer | -| `TestQueryTopNWithFilter` | Filter by HTTP status code | -| `TestQueryTrend` | Per-bucket counts, oldest-first ordering | -| `TestCoarseRingPopulated` | 5 fine ticks → 1 coarse bucket, count aggregation | -| `TestSubscribeBroadcast` | Fan-out channel delivery after rotation | -| `TestTopKOrdering` | Heap select returns correct top-K descending | -| `TestMultiTailerReadsLines` | Live file write → LogRecord received on channel | -| `TestMultiTailerMultipleFiles` | 5 files, one watcher, all lines received | -| `TestMultiTailerLogRotation`| RENAME → drain → retry → new file tailed correctly | -| `TestExpandGlobs` | Glob pattern expands to matching files only | -| `TestExpandGlobsDeduplication` | Same file via path + glob deduplicated to one | -| `TestMemoryBudget` | Full ring fill stays within 1 GB heap | -| `TestGRPCEndToEnd` | Real gRPC server: TopN, filtered TopN, Trend, StreamSnapshots | - -**Total: 17 tests, all passing.** - ---- - -## Benchmark results - -Run with: `go test ./cmd/collector/ -bench=. -benchtime=3s` - -Hardware: 12th Gen Intel Core i7-12700T - -| Benchmark | ns/op | throughput | headroom vs 10K/s | -|--------------------|-------|----------------|-------------------| -| `BenchmarkParseLine` | 418 | ~2.4M lines/s | 240× | -| `BenchmarkIngest` | 152 | ~6.5M records/s| 650× | - -Both the parser and the store ingestion goroutine have several hundred times more capacity than -the 10 000 lines/second peak requirement. The bottleneck at scale will be fsnotify event delivery -and kernel I/O, not the Go code. diff --git a/docs/PLAN_FRONTEND.md b/docs/PLAN_FRONTEND.md deleted file mode 100644 index 012fce3..0000000 --- a/docs/PLAN_FRONTEND.md +++ /dev/null @@ -1,334 +0,0 @@ -# Frontend v0 — Implementation Plan - -Module path: `git.ipng.ch/ipng/nginx-logtail` - -**Scope:** An HTTP server that queries a collector or aggregator and renders a drilldown TopN -dashboard with trend sparklines. Zero JavaScript. Filter state in the URL. Auto-refreshes every -30 seconds. Works with any `LogtailService` endpoint (collector or aggregator). - ---- - -## Overview - -Single page, multiple views driven entirely by URL query parameters: - -``` -http://frontend:8080/?target=agg:9091&w=5m&by=website&f_status=429&n=25 -``` - -Clicking a table row drills down: it adds a filter for the clicked label and advances -`by` to the next dimension in the hierarchy (`website → prefix → uri → status`). The -breadcrumb strip shows all active filters; each token is a link that removes it. 
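A sketch of the row-click URL construction using `net/url` — the `f_*` and `by` parameter names follow this plan, but the helper itself is illustrative:

```go
// drillURL builds the href for a clicked table row: it records the clicked
// label as a filter on the current dimension and advances `by` to the next
// dimension in the hierarchy. Hypothetical helper, not the handler's code.
func drillURL(cur url.Values, by, label string) string {
	next := map[string]string{
		"website": "prefix",
		"prefix":  "uri",
		"uri":     "status",
		"status":  "status", // last dimension in the v0 plan: keep by unchanged
	}
	q := url.Values{}
	for k, vs := range cur {
		q[k] = append([]string(nil), vs...) // copy so the caller's values stay intact
	}
	q.Set("f_"+by, label)
	q.Set("by", next[by])
	return "/?" + q.Encode()
}
```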
- ---- - -## Step 1 — main.go - -Flags: - -| Flag | Default | Description | -|------|---------|-------------| -| `--listen` | `:8080` | HTTP listen address | -| `--target` | `localhost:9091` | Default gRPC endpoint (aggregator or collector) | -| `--n` | `25` | Default number of table rows | -| `--refresh` | `30` | `` interval in seconds; 0 to disable | - -Wire-up: -1. Parse flags -2. Register `http.HandleFunc("/", handler)` (single handler, all state in URL) -3. `http.ListenAndServe` -4. `signal.NotifyContext` for clean shutdown on SIGINT/SIGTERM - ---- - -## Step 2 — client.go - -```go -func dial(addr string) (*grpc.ClientConn, pb.LogtailServiceClient, error) -``` - -Identical to the CLI version — plain insecure dial. A new connection is opened per HTTP -request. At a 30-second page refresh rate this is negligible; pooling is not needed. - ---- - -## Step 3 — handler.go - -### URL parameters - -| Param | Default | Values | -|-------|---------|--------| -| `target` | flag default | `host:port` | -| `w` | `5m` | `1m 5m 15m 60m 6h 24h` | -| `by` | `website` | `website prefix uri status` | -| `n` | flag default | positive integer | -| `f_website` | — | string | -| `f_prefix` | — | string | -| `f_uri` | — | string | -| `f_status` | — | integer string | -| `raw` | — | `1` → respond with JSON instead of HTML | - -### Request flow - -``` -parseURLParams(r) → QueryParams -buildFilter(QueryParams) → *pb.Filter -dial(target) → client -concurrent: - client.TopN(filter, groupBy, n, window) → TopNResponse - client.Trend(filter, window) → TrendResponse -renderSparkline(TrendResponse.Points) → template.HTML -buildTableRows(TopNResponse, QueryParams) → []TableRow (includes drill-down URL per row) -buildBreadcrumbs(QueryParams) → []Crumb -execute template → w -``` - -TopN and Trend RPCs are issued concurrently (both have a 5 s context deadline). If Trend -fails, the sparkline is omitted silently rather than returning an error page. - -### `raw=1` mode - -Returns the TopN response as JSON (same format as the CLI's `--json`). Useful for scripting -and `curl` without needing the CLI binary. - -### Drill-down URL construction - -Dimension advance hierarchy (for row-click links): - -``` -website → CLIENT_PREFIX → REQUEST_URI → HTTP_RESPONSE → (no advance; all dims filtered) -``` - -Row-click URL: take current params, add the filter for the current `by` dimension, and set -`by` to the next dimension. If already on the last dimension (`status`), keep `by` unchanged. - -### Types - -```go -type QueryParams struct { - Target string - Window pb.Window - WindowS string // "5m" — for display - GroupBy pb.GroupBy - GroupByS string // "website" — for display - N int - Filter filterState -} - -type filterState struct { - Website string - Prefix string - URI string - Status string // string so empty means "unset" -} - -type TableRow struct { - Rank int - Label string - Count int64 - Pct float64 // 0–100, relative to top entry - DrillURL string // href for this row -} - -type Crumb struct { - Text string // e.g. "website=example.com" - RemoveURL string // current URL with this filter removed -} - -type PageData struct { - Params QueryParams - Source string - Entries []TableRow - TotalCount int64 - Sparkline template.HTML // "" if trend call failed - Breadcrumbs []Crumb - RefreshSecs int - Error string // non-empty → show error banner, no table -} -``` - ---- - -## Step 4 — sparkline.go - -```go -func renderSparkline(points []*pb.TrendPoint) template.HTML -``` - -- Fixed `viewBox="0 0 300 60"` SVG. 
-- X axis: evenly-spaced buckets across 300 px. -- Y axis: linear scale from 0 to max count, inverted (SVG y=0 is top). -- Rendered as a `` with `stroke` and `fill="none"`. Minimal inline style, no classes. -- If `len(points) < 2`, returns `""` (no sparkline). -- Returns `template.HTML` (already-escaped) so the template can emit it with `{{.Sparkline}}`. - ---- - -## Step 5 — templates/ - -Two files, embedded with `//go:embed templates/*.html` and parsed once at startup. - -### `templates/base.html` (define "base") - -Outer HTML skeleton: -- `` (omitted if `RefreshSecs == 0`) -- Minimal inline CSS: monospace font, max-width 1000px, table styling, breadcrumb strip -- Yields a `{{template "content" .}}` block - -No external CSS, no web fonts, no icons. Legible in a terminal browser (w3m, lynx). - -### `templates/index.html` (define "content") - -Sections in order: - -**Window tabs** — `1m | 5m | 15m | 60m | 6h | 24h`; current window is bold/underlined; -each is a link that swaps only `w=` in the URL. - -**Group-by tabs** — `by website | by prefix | by uri | by status`; current group highlighted; -links swap `by=`. - -**Filter breadcrumb** — shown only when at least one filter is active: -``` -Filters: [website=example.com ×] [status=429 ×] -``` -Each `×` is a link to the URL without that filter. - -**Error banner** — shown instead of table when `.Error` is non-empty. - -**Trend sparkline** — the SVG returned by `renderSparkline`, inline. Labelled with window -and source. Omitted when `.Sparkline == ""`. - -**TopN table**: -``` -RANK LABEL COUNT % TREND - 1 example.com 18 432 62 % ████████████ - 2 other.com 4 211 14 % ████ -``` -- `LABEL` column is a link (`DrillURL`). -- `%` is relative to the top entry (rank-1 always 100 %). -- `TREND` bar is an inline `` tag — renders as a native browser bar, - degrades gracefully in text browsers to `N/100`. -- Rows beyond rank 3 show the percentage bar only if it's > 5 %, to avoid noise. - -**Footer** — "source: queried refresh 30 s" — lets operators confirm -which endpoint they're looking at. - ---- - -## Step 6 — Tests (`frontend_test.go`) - -In-process fake gRPC server (same pattern as aggregator and CLI tests). - -| Test | What it covers | -|------|----------------| -| `TestParseQueryParams` | All URL params parsed correctly; defaults applied | -| `TestParseQueryParamsInvalid` | Bad `n`, bad `w`, bad `f_status` → defaults or 400 | -| `TestBuildFilterFromParams` | Populated filter; nil when nothing set | -| `TestDrillURL` | website → prefix drill; prefix → uri drill; status → no advance | -| `TestBuildCrumbs` | One crumb per active filter; remove-URL drops just that filter | -| `TestRenderSparkline` | 5 points → valid SVG containing `` | -| `cmd/frontend/format.go` | `fmtCount()` — space-separated thousands, registered as template func | -| `cmd/frontend/templates/base.html` | Outer HTML shell, inline CSS, meta-refresh | -| `cmd/frontend/templates/index.html` | Window tabs, group-by tabs, breadcrumb, sparkline, table, footer | - -### Deviations from the plan - -- **`format.go` extracted**: `fmtCount` placed in its own file (not in `handler.go`) so it can - be tested independently without loading the template. -- **`TestDialFake` added**: sanity check for the fake gRPC infrastructure used by the other tests. -- **`TestHandlerNoData` added**: verifies the "no data" message renders correctly when the server - returns an empty entry list. Total tests: 23 (plan listed 13). 
-- **`% relative to rank-1`** as planned; the `` shows 100% for rank-1 - and proportional bars below. Rank-1 is always the visual baseline. -- **`status → website` drill cycle**: clicking a row in the `by status` view adds `f_status` - and resets `by=website` (cycles back to the start of the drilldown hierarchy). - -### Test results - -``` -$ go test ./... -count=1 -race -timeout 60s -ok git.ipng.ch/ipng/nginx-logtail/cmd/frontend 1.1s (23 tests) -ok git.ipng.ch/ipng/nginx-logtail/cmd/cli 1.0s (14 tests) -ok git.ipng.ch/ipng/nginx-logtail/cmd/aggregator 4.1s (13 tests) -ok git.ipng.ch/ipng/nginx-logtail/cmd/collector 9.7s (17 tests) -``` - -### Test inventory - -| Test | What it covers | -|------|----------------| -| `TestParseWindowString` | All 6 window strings + bad input → default | -| `TestParseGroupByString` | All 4 group-by strings + bad input → default | -| `TestParseQueryParams` | All URL params parsed correctly | -| `TestParseQueryParamsDefaults` | Empty URL → handler defaults applied | -| `TestBuildFilter` | Filter proto fields set from filterState | -| `TestBuildFilterNil` | Returns nil when no filter set | -| `TestDrillURL` | website→prefix, prefix→uri, status→website cycle | -| `TestBuildCrumbs` | Correct text and remove-URLs for active filters | -| `TestRenderSparkline` | 5 points → SVG with polyline | -| `TestRenderSparklineTooFewPoints` | nil/1 point → empty string | -| `TestRenderSparklineAllZero` | All-zero counts → empty string | -| `TestFmtCount` | Space-thousands formatting | -| `TestHandlerTopN` | Fake server; labels and formatted counts in HTML | -| `TestHandlerRaw` | `raw=1` → JSON with source/window/group_by/entries | -| `TestHandlerBadTarget` | Unreachable target → 502 + error message in body | -| `TestHandlerFilterPassedToServer` | `f_website` + `f_status` reach gRPC filter | -| `TestHandlerWindowPassedToServer` | `w=60m` → `pb.Window_W60M` in request | -| `TestHandlerBreadcrumbInHTML` | Active filter renders crumb with × link | -| `TestHandlerSparklineInHTML` | Trend points → `` in page | -| `TestHandlerPctBar` | 100% for rank-1, 50% for half-count entry | -| `TestHandlerWindowTabsInHTML` | All 6 window labels rendered as links | -| `TestHandlerNoData` | Empty entry list → "no data" message | -| `TestDialFake` | Test infrastructure sanity check | - ---- - -## Deferred (not in v0) - -- Dark mode (prefers-color-scheme media query) -- Per-row mini sparklines (one Trend RPC per table row — expensive; need batching first) -- WebSocket or SSE for live push instead of meta-refresh -- Pagination for large N -- `?format=csv` download -- OIDC/basic-auth gating -- ClickHouse-backed 7d/30d windows (tracked in README) diff --git a/docs/design.md b/docs/design.md new file mode 100644 index 0000000..9f93617 --- /dev/null +++ b/docs/design.md @@ -0,0 +1,608 @@ + +# nginx-logtail Design Document + +## Metadata + +| | | +| --- | --- | +| **Status** | Describes intended behavior as of `v0.2.0` | +| **Author** | Pim van Pelt `` | +| **Last updated** | 2026-04-17 | +| **Audience** | Operators and contributors running real-time traffic analysis and DDoS detection across a fleet of nginx hosts | + +The key words **MUST**, **MUST NOT**, **SHOULD**, **SHOULD NOT**, and **MAY** are used as described in +[RFC 2119](https://datatracker.ietf.org/doc/html/rfc2119), and are reserved in this document for requirements that are intended to be +enforced in code or by an external dependency. 
Plain-language descriptions of what the system or an operator can do are written in +lowercase — "can", "will", "does" — and should not be read as normative. + +## Summary + +`nginx-logtail` is a four-binary Go system for real-time analysis of nginx traffic across a fleet of hosts. Each nginx host runs a +**collector** that ingests logs (from files via `fsnotify`, from a UDP socket, or both) and maintains in-memory ranked top-K counters +across multiple time windows. A central **aggregator** subscribes to the collectors' snapshot streams and serves a merged view. An +**HTTP frontend** renders a drilldown dashboard (server-rendered HTML, zero JavaScript). A **CLI** offers the same queries as a +shell companion. All four programs speak a single gRPC service (`LogtailService`), so the frontend and CLI work against any collector +or the aggregator interchangeably. + +## Background + +Operators running tens of nginx hosts behind a load balancer need a live, drilldown view of request traffic for DDoS detection and +traffic analysis. Questions the system answers include: + +- Which client prefix is causing the most HTTP 429s right now? +- Which website is getting the most 503s over the last 24 hours? +- Which nginx machine is the busiest? +- Is there a DDoS in progress, and from where? + +Existing log-analysis pipelines (ELK, Loki, ClickHouse, etc.) answer questions like these but require infrastructure that is +disproportionate for the target workload. A handful of nginx hosts each doing ~10 K req/s at peak can be kept on a per-minute top-K +structure in ~1 GB of RAM per host, with <250 ms query latency across the whole fleet, without a storage tier. + +A companion project, [`nginx-ipng-stats-plugin`](https://git.ipng.ch/ipng/nginx-ipng-stats-plugin), adds per-device attribution in nginx +itself and can emit a logtail-format access log as UDP datagrams. `nginx-logtail` was extended in `v0.2.0` to ingest that stream +natively, so operators can run it either from on-disk log files, from the UDP feed, or both on the same host. + +## Goals and Non-Goals + +### Product Goals + +1. **Live top-K per (website, client_prefix, URI, status, is_tor, asn, source_tag).** For every combination of these dimensions the + system maintains an integer count, ranked so that the top entries are readily available across 1 m, 5 m, 15 m, 60 m, 6 h, and 24 h + windows. +2. **Sub-second query latency.** `TopN` and `Trend` queries MUST return from the collector and from the aggregator in well under one + second at the target scale (10 hosts, 10 K req/s each). +3. **Bounded memory.** The collector MUST stay within a 1 GB steady-state memory budget regardless of input cardinality, including + during high-cardinality DDoS attacks. +4. **Two ingest paths, one data model.** On-disk log files (`fsnotify`-tailed, logrotate-aware) and UDP datagrams (from + `nginx-ipng-stats-plugin`) MUST both feed the same in-memory structure, with a single log format per path and no operator-visible + difference downstream. +5. **No external storage, no TLS, no CGO.** The entire system runs as four static Go binaries on a trusted internal network. Operators + who need retention beyond the ring buffers SHOULD scrape Prometheus. +6. **One service contract.** Collectors and the aggregator implement the same gRPC `LogtailService`. Frontend and CLI MUST work + against either interchangeably, with the collector returning "itself" from `ListTargets` and the aggregator returning its configured + collector set. 
+ +### Non-Goals + +- The system does **not** parse arbitrary nginx `log_format` strings. Two fixed tab-separated formats are supported: a file format and + a UDP format (see FR-2). Operators who need general parsing should use Vector, Fluent Bit, or Promtail. +- The system does **not** store raw log lines. Counts are aggregated at ingest; the original log lines are not kept in memory or on + disk. The project does not replace an access log. +- The system does **not** persist counters across restarts. Ring buffers are in-memory only. On aggregator restart, historical state + is reconstructed by calling `DumpSnapshots` on each collector (FR-4.3). On collector restart the rings start empty and refill as new + traffic arrives. +- The system does **not** provide per-URI request timing distributions. Latency histograms exist only in the collector's Prometheus + exposition (per host), not in the top-K data model. +- The system does **not** ship TLS or authentication for its gRPC endpoints. Operators who expose it beyond a trusted network are + expected to terminate TLS in a front proxy. +- The system is **not** a general-purpose metric store. The Prometheus exporter on the collector exposes a deliberately narrow set: + per-host request counter, per-host body-size and request-time histograms, and per-`source_tag` rollup counters. + +## Requirements + +Each requirement carries a unique identifier (`FR-X.Y` or `NFR-X.Y`) so that later sections can cite it. + +### Functional Requirements + +**FR-1 Counter data model** + +- **FR-1.1** The canonical unit of counting MUST be a 7-tuple + `(website, client_prefix, http_request_uri, http_response, is_tor, asn, ipng_source_tag)` mapped to a 64-bit integer request count. + The data model contains no other fields: no timing, no byte counts, no method (those live only in the Prometheus exposition, + FR-8). +- **FR-1.2** `website` MUST be the nginx `$host` value. +- **FR-1.3** `client_prefix` MUST be the client IP truncated to a configurable prefix length, formatted as CIDR. Default `/24` for + IPv4 and `/48` for IPv6 (flags `-v4prefix`, `-v6prefix`). Truncation happens at ingest; the original address is not retained. +- **FR-1.4** `http_request_uri` MUST be the `$request_uri` path only — the query string (from the first `?` onward) MUST be stripped + at ingest. This is the dominant cardinality-reduction measure; DDoS traffic with attacker-generated query strings cannot grow the + working set. +- **FR-1.5** `http_response` MUST be the HTTP status code as recorded by nginx. +- **FR-1.6** `is_tor` MUST be a boolean, populated by the operator in the log format (typically via a lookup against a TOR exit-node + list). For the file format, lines without this field default to `false` for backward compatibility. +- **FR-1.7** `asn` MUST be an int32 decimal value sourced from MaxMind GeoIP2 (or equivalent). For the file format, lines without + this field default to `0`. +- **FR-1.8** `ipng_source_tag` MUST be a short string identifying which attribution tag the request arrived under. For records from + on-disk log files, the collector MUST assign the tag `"direct"` (mirroring `nginx-ipng-stats-plugin`'s default-source convention). For + records from the UDP stream, the tag is taken from the log line as emitted by the plugin. 
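As a non-normative illustration of FR-1.3, the truncation maps a client address onto its CIDR bucket; the standard library's `net/netip` expresses it directly (the collector's own `TruncateIP` may differ in details):

```go
package main

import (
	"fmt"
	"net/netip"
)

// truncatePrefix buckets a client address into its /24 (IPv4) or /48 (IPv6)
// prefix and returns the CIDR string stored in the counter tuple.
func truncatePrefix(client string, v4bits, v6bits int) (string, error) {
	addr, err := netip.ParseAddr(client)
	if err != nil {
		return "", err
	}
	bits := v6bits
	if addr.Is4() {
		bits = v4bits
	}
	p, err := addr.Prefix(bits) // masks the host bits
	if err != nil {
		return "", err
	}
	return p.String(), nil
}

func main() {
	v4, _ := truncatePrefix("192.0.2.55", 24, 48)
	v6, _ := truncatePrefix("2001:db8:cafe::1", 24, 48)
	fmt.Println(v4, v6) // 192.0.2.0/24 2001:db8:cafe::/48
}
```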
+ +**FR-2 Log formats** + +- **FR-2.1 File format.** The collector MUST accept nginx access logs in the following tab-separated layout, with the last two fields + (`is_tor`, `asn`) optional for backward compatibility: + + ```nginx + log_format logtail '$host\t$remote_addr\t$msec\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn'; + ``` + + | # | Field | Ingested into | + |---|-------------------|----------------------------| + | 0 | `$host` | `website` | + | 1 | `$remote_addr` | `client_prefix` (truncated)| + | 2 | `$msec` | (discarded) | + | 3 | `$request_method` | Prom `method` label | + | 4 | `$request_uri` | `http_request_uri` | + | 5 | `$status` | `http_response` | + | 6 | `$body_bytes_sent`| Prom body histogram | + | 7 | `$request_time` | Prom duration histogram | + | 8 | `$is_tor` | `is_tor` (optional) | + | 9 | `$asn` | `asn` (optional) | + +- **FR-2.2 UDP format.** The collector MUST accept datagrams in the following tab-separated layout, as emitted by + `nginx-ipng-stats-plugin`'s `ipng_stats_logtail` directive: + + ```nginx + log_format ipng_stats_logtail '$host\t$remote_addr\t$request_method\t$request_uri\t$status\t$body_bytes_sent\t$request_time\t$is_tor\t$asn\t$ipng_source_tag\t$server_addr\t$scheme'; + ``` + + Exactly 12 tab-separated fields are required. `$server_addr` and `$scheme` MUST be parsed but dropped; they are reserved for + future use. Malformed datagrams MUST be counted (FR-8.5) and silently dropped. + +- **FR-2.3** The file tailer MUST set `source_tag="direct"` on every record it parses. The UDP listener MUST propagate + `$ipng_source_tag` verbatim. This is the only difference in downstream processing between the two ingest paths. + +**FR-3 Ring buffers and time windows** + +- **FR-3.1** Each collector and the aggregator MUST maintain two tiered ring buffers: + + | Tier | Bucket size | Buckets | Top-K/bucket | Covers | + |--------|-------------|---------|--------------|--------| + | Fine | 1 min | 60 | 50 000 | 1 h | + | Coarse | 5 min | 288 | 5 000 | 24 h | + +- **FR-3.2** The `Window` enum MUST map queries to tiers as follows: + + | Window | Tier | Buckets summed | + |--------|--------|----------------| + | 1 m | fine | 1 | + | 5 m | fine | 5 | + | 15 m | fine | 15 | + | 60 m | fine | 60 | + | 6 h | coarse | 72 | + | 24 h | coarse | 288 | + +- **FR-3.3** Every minute, the collector MUST snapshot its live map into the fine ring (top-50 000, sorted desc) and reset the live + map. Every fifth fine tick, the collector MUST merge the most recent five fine snapshots into one coarse snapshot (top-5 000). + The fine/coarse merge MUST be pinned to the 1-minute and 5-minute boundaries of the local clock so sparklines align across + collectors. +- **FR-3.4** Querying MUST always read from the rings, never from the live map. A sub-minute request MUST return an empty top-1 + result rather than surfacing partially-accumulated data; this keeps per-minute results monotonic. + +**FR-4 Push-based streaming and aggregation** + +- **FR-4.1** The collector MUST expose a server-streaming RPC `StreamSnapshots(SnapshotRequest) → stream Snapshot` that emits one fine + (1-min) snapshot per minute rotation. Subscribers MUST receive the same snapshot independently (per-subscriber buffered fan-out, + bounded buffer, drop on full). +- **FR-4.2** The aggregator MUST subscribe to every configured collector via `StreamSnapshots` and merge snapshots into a single + ring-buffer cache. 
The merge strategy MUST be delta-based: on each new snapshot from collector `X`, the aggregator MUST subtract + `X`'s previous contribution and add the new entries, giving `O(snapshot_size)` per update (not `O(N_collectors × size)`). +- **FR-4.3** The aggregator MUST expose a unary `DumpSnapshots(DumpSnapshotsRequest) → stream Snapshot` on each collector that + streams all fine buckets (with `is_coarse=false`) followed by all coarse buckets (with `is_coarse=true`). On aggregator startup, it + MUST call `DumpSnapshots` against every collector once (concurrently, after its own gRPC server is already listening), merge the + per-timestamp entries the same way the live path does, and load the result into its cache via a single atomic replacement. + Collectors that return `Unimplemented` MUST be skipped without blocking live streaming from the others. +- **FR-4.4** The aggregator MUST reconnect to each collector independently with exponential backoff (100 ms → cap 30 s). After three + consecutive connection failures the aggregator MUST zero the degraded collector's contribution (subtract its last-known snapshot + and delete its entry). When the collector recovers and sends a new snapshot, its contribution MUST automatically be reintegrated. + +**FR-5 Query service (`LogtailService`)** + +- **FR-5.1** Collector and aggregator MUST implement the same gRPC `LogtailService`: + + ```protobuf + service LogtailService { + rpc TopN(TopNRequest) returns (TopNResponse); + rpc Trend(TrendRequest) returns (TrendResponse); + rpc StreamSnapshots(SnapshotRequest) returns (stream Snapshot); + rpc ListTargets(ListTargetsRequest) returns (ListTargetsResponse); + rpc DumpSnapshots(DumpSnapshotsRequest)returns (stream Snapshot); + } + ``` + +- **FR-5.2** `Filter` MUST support exact, inequality, and RE2-regex constraints on the dimensions of FR-1. Status and ASN accept + the six-operator expression language (`=`, `!=`, `>`, `>=`, `<`, `<=`). Website and URI accept regex match and regex exclusion. + TOR filtering uses a three-state enum (`ANY`/`YES`/`NO`). Source-tag filtering is exact match only. +- **FR-5.3** `GroupBy` MUST cover every dimension of FR-1 except `is_tor` (which is boolean and rarely useful as a group-by target): + `WEBSITE`, `CLIENT_PREFIX`, `REQUEST_URI`, `HTTP_RESPONSE`, `ASN_NUMBER`, `SOURCE_TAG`. +- **FR-5.4** `ListTargets` MUST return, from the aggregator, every configured collector with its display name and gRPC address; from + a collector, a single entry describing itself with an empty `addr` (meaning "this endpoint"). +- **FR-5.5** All queries MUST be answered from the local ring buffers. The aggregator MUST NOT fan out to collectors at query time. + +**FR-6 HTTP frontend** + +- **FR-6.1** The frontend MUST render a server-rendered HTML dashboard with no JavaScript, using inline SVG for sparklines and + `` for auto-refresh. It MUST work in text-mode browsers (w3m, lynx) and under `curl`. +- **FR-6.2** All filter, group-by, and window state MUST live in the URL query string so that URLs are shareable and bookmarkable. + No server-side session. +- **FR-6.3** The frontend MUST provide a drilldown affordance: clicking a row MUST add that row's value as a filter and advance the + group-by dimension through the cycle + `website → prefix → uri → status → asn → source_tag → website`. +- **FR-6.4** The frontend MUST issue `TopN`, `Trend`, and `ListTargets` concurrently with a 5 s deadline. `Trend` failure MUST + suppress the sparkline but not the table. 
`ListTargets` failure MUST hide the source picker but not the rest of the page. +- **FR-6.5** Appending `&raw=1` to any URL MUST return the `TopN` result as JSON, so the dashboard can be scripted without the CLI. +- **FR-6.6** The frontend MUST accept a `q=` parameter holding a mini filter expression (`status>=400 AND website~=gouda.*`). On + submission it MUST parse the expression and redirect to the canonical URL with the individual `f_*` params populated; parse errors + MUST render inline without losing the current filter state. + +**FR-7 CLI** + +- **FR-7.1** The CLI MUST provide four subcommands: `topn`, `trend`, `stream`, `targets`. Each subcommand MUST accept + `--target host:port[,host:port...]` and fan out concurrently, printing results in order with per-target headers (omitted for + single-target invocations, so output pipes cleanly into `jq`). +- **FR-7.2** The CLI MUST expose every `Filter` dimension as a dedicated flag and default to a human-readable table. `--json` MUST + switch to newline-delimited JSON for `stream` and to a single JSON array for `topn`/`trend`. +- **FR-7.3** `stream` MUST reconnect automatically on error with a 5 s backoff and run until interrupted. + +**FR-8 Prometheus exposition (collector only)** + +- **FR-8.1** The collector MUST expose a Prometheus `/metrics` endpoint on `-prom-listen` (default `:9100`). Setting the flag to the + empty string MUST disable it entirely. +- **FR-8.2** The collector MUST expose a per-request counter `nginx_http_requests_total{host, method, status}` capped at + `promCounterCap = 250 000` distinct label sets. When the cap is reached, further new label sets MUST be dropped (existing series + keep incrementing) until the map is rolled over. +- **FR-8.3** The collector MUST expose per-host histograms + `nginx_http_response_body_bytes{host, le}` (body-size distribution) and + `nginx_http_request_duration_seconds{host, le}` (request-time distribution). The duration histogram MUST NOT be split by + `source_tag` — its bucket count would multiply without operational benefit. +- **FR-8.4** The collector MUST expose two parallel roll-ups labeled by `source_tag` only (not cross-producted with host): + `nginx_http_requests_by_source_total{source_tag}` and + `nginx_http_response_body_bytes_by_source{source_tag, le}`. These are separate metric names to avoid inconsistent label sets + under a single name. +- **FR-8.5** The collector MUST expose three counters that let operators distinguish UDP parse failures from back-pressure drops: + `logtail_udp_packets_received_total` (datagrams off the socket), + `logtail_udp_loglines_success_total` (parsed OK), and + `logtail_udp_loglines_consumed_total` (forwarded to the store — i.e. not dropped). + +### Non-Functional Requirements + +**NFR-1 Correctness under concurrency** + +- **NFR-1.1** The collector MUST run a single goroutine ("the store goroutine") that owns the live map and the ring-buffer write + path. No other goroutine MUST write to these structures. The file tailer and the UDP listener MUST communicate with the store + goroutine through a bounded channel. +- **NFR-1.2** Readers (query RPCs and subscriber fan-out) MUST take an `RLock` on the rings. Writers MUST take a `Lock` only for the + moment the slice header of the new snapshot is installed; serialisation and network I/O MUST happen outside the lock. +- **NFR-1.3** `DumpSnapshots` MUST copy ring headers and filled counts under `RLock` only, then release the lock before streaming. 
+
+**NFR-2 Memory bounds**
+
+- **NFR-2.1** The collector's live map MUST be hard-capped at 100 000 entries. Once the cap is reached, updates to existing keys
+  MUST continue to be applied; new keys MUST be dropped until the next minute rotation resets the map. This bounds memory under
+  high-cardinality attacks.
+- **NFR-2.2** Fine-ring snapshots MUST be capped at top-50 000 entries; coarse-ring snapshots at top-5 000. The full memory budget
+  for a collector is therefore approximately 845 MB (live map ~19 MB + fine ring ~558 MB + coarse ring ~268 MB).
+- **NFR-2.3** The aggregator MUST apply the same tier caps as the collector. Its steady-state memory is roughly equivalent to one
+  collector regardless of the number of collectors subscribed.
+- **NFR-2.4** The Prometheus counter map (FR-8.2) MUST be capped at `promCounterCap = 250 000` entries. The per-host and per-source
+  histograms MUST NOT be capped explicitly — they grow only with the distinct host count, which is bounded by the operator's vhost
+  configuration.
+
+**NFR-3 Performance**
+
+- **NFR-3.1** `ParseLine` and `ParseUDPLine` MUST use `strings.Split` / `strings.SplitN` (no regex), so that per-line cost stays
+  around 50 ns on commodity hardware.
+- **NFR-3.2** `TopN` and `Trend` queries across the full 24-hour coarse ring MUST complete in well under 250 ms at the 50 000-entry
+  fine cap, for fully-specified filters.
+- **NFR-3.3** The collector's input channel MUST be sized to absorb approximately 20 s of peak load (e.g. 200 000 at 10 K lines/s)
+  so that transient pauses in the store goroutine do not back up the tailer or the UDP listener.
+- **NFR-3.4** When either the tailer or the UDP listener cannot enqueue a parsed record because the channel is full, the record
+  MUST be dropped rather than blocking the ingest goroutine. UDP drops MUST be visible via the counters in FR-8.5; file-path drops
+  are implicit (the tailer falls behind the file).
+
+**NFR-4 Fault tolerance and recovery**
+
+- **NFR-4.1** The file tailer MUST tolerate logrotate automatically. On `RENAME`/`REMOVE` events it MUST drain the old file
+  descriptor to EOF, close it, and retry opening the original path with exponential backoff until the new file appears. No SIGHUP
+  or restart is required.
+- **NFR-4.2** The aggregator MUST NOT block frontend queries during backfill. Its gRPC server MUST start listening first; backfill
+  (FR-4.3) MUST run in a background goroutine.
+- **NFR-4.3** A collector restart MUST NOT affect peer collectors or the aggregator's ability to continue serving the surviving
+  collectors' data. When the restarted collector reconnects, its stream MUST resume without operator action.
+- **NFR-4.4** An aggregator restart MUST recover its ring-buffer contents from all collectors via `DumpSnapshots`; live streaming
+  MUST resume in parallel with backfill so that no minute is lost even during a restart.
+
+**NFR-5 Observability of the system itself**
+
+- **NFR-5.1** The collector MUST expose operator-facing log lines on stdout covering: file discovery, logrotate reopen events, UDP
+  listener bind, subscriber connect/disconnect, and fatal configuration errors. 
The collector MUST NOT log anything on the per-request + hot path. +- **NFR-5.2** The aggregator MUST log each collector's connect, disconnect, degraded transition, and recovery. Backfill MUST log a + per-collector line with bucket counts, entry counts, and wall-clock duration. +- **NFR-5.3** The Prometheus exporter MUST be the primary out-of-band health signal. Counters FR-8.5 plus the per-host request + counter (FR-8.2) give an operator a full view of ingest health without needing to read the logs. + +**NFR-6 Security** + +- **NFR-6.1** gRPC traffic MUST be cleartext HTTP/2. Operators who expose the endpoints beyond a trusted network are expected to + terminate TLS in a front proxy. +- **NFR-6.2** The collector MUST bind its UDP listener to `127.0.0.1` by default (configurable via `-logtail-bind`) so that merely + setting `-logtail-port` MUST NOT expose the socket to the public Internet. +- **NFR-6.3** The system MUST NOT record per-request personally-identifying data beyond what nginx already logs. Client IPs are + truncated at ingest (FR-1.3); URIs lose their query strings (FR-1.4). + +**NFR-7 Documentation and packaging** + +- **NFR-7.1** The repository MUST ship `docs/user-guide.md` that walks an operator through nginx log format configuration, running + each of the four binaries (flags, systemd examples, Docker Compose), and integrating the Prometheus exporter. It MUST contain + enough examples that a new operator can stand up a single-host deployment end-to-end without reading the source. +- **NFR-7.2** The repository MUST ship `docs/design.md` (this document) covering the normative requirements and the architectural + rationale. +- **NFR-7.3** All four binaries MUST build as static Go binaries with `CGO_ENABLED=0 -trimpath -ldflags="-s -w"` and MUST ship + together in a single `scratch`-based Docker image. No OS, no shell, no runtime dependencies. + +## Architecture Overview + +### Process Model + +The project ships four binaries: + +- **`collector`** — runs on every nginx host. Ingests logs from files and/or UDP, maintains the live map and tiered rings, serves + `LogtailService` on port 9090, and exposes Prometheus on port 9100. +- **`aggregator`** — runs centrally. Subscribes to every collector, merges snapshots, serves the same `LogtailService` on port 9091. +- **`frontend`** — runs centrally, alongside the aggregator. HTTP server on port 8080, rendering HTML against the aggregator (or any + other `LogtailService` endpoint). +- **`cli`** — runs wherever the operator is. Talks to any `LogtailService`. No daemon. + +Because all four binaries speak one service, the aggregator is optional for a single-host deployment: the frontend and CLI can point +directly at a collector. 
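+
+Because every endpoint speaks the same service, the same client code works against a collector on `:9090` or the aggregator on
+`:9091`. A minimal, non-normative probe is sketched below; the addresses are placeholders and the generated `logtailpb` field
+names are assumed.
+
+```go
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"time"
+
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+
+	pb "git.ipng.ch/ipng/nginx-logtail/proto/logtailpb"
+)
+
+func main() {
+	// Placeholder addresses: one collector, one aggregator.
+	for _, target := range []string{"nginx1:9090", "agg:9091"} {
+		conn, err := grpc.NewClient(target, grpc.WithTransportCredentials(insecure.NewCredentials()))
+		if err != nil {
+			log.Fatal(err)
+		}
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		resp, err := pb.NewLogtailServiceClient(conn).ListTargets(ctx, &pb.ListTargetsRequest{})
+		cancel()
+		if err != nil {
+			log.Printf("%s: %v", target, err)
+		} else {
+			// A collector reports a single entry with an empty addr; the aggregator
+			// reports one entry per configured collector (FR-5.4).
+			for _, t := range resp.GetTargets() {
+				fmt.Printf("%s -> name=%q addr=%q\n", target, t.GetName(), t.GetAddr())
+			}
+		}
+		conn.Close()
+	}
+}
+```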
+ +### Data Flow + +``` + ┌──────────────┐ files ┌───────────────┐ + nginx ──▶ │ access.log │───────▶│ file tailer │ + │ (file mode) │ │ (fsnotify) │──┐ + └──────────────┘ └───────────────┘ │ + │ + ┌──────────────┐ UDP ┌───────────────┐ │ + nginx-ipng ▶ ipng_stats_ ├───────▶│ udp listener │──┼──▶ LogRecord ──▶ ┌──────────┐ + -stats- │ logtail │ │ (127.0.0.1) │ │ channel (200K)│ store │ + plugin └──────────────┘ └───────────────┘ │ │ goroutine│ + │ └─────┬────┘ + ▼ │ + Prom exporter │ + ▼ + ┌─────────────┐ + │ live map │ + │ (≤100 K) │ + └──────┬──────┘ + │ every 1 m + ▼ + ┌─────────────┐ + │ fine ring │ + │ 60×50 K │────┐ + └──────┬──────┘ │ + │ every 5 m │ + ▼ │ + ┌─────────────┐ │ + │ coarse ring │ │ + │ 288×5 K │ │ + └─────────────┘ │ + │ + ┌──────────────────────────────────────┘ + │ StreamSnapshots (push) + ▼ + aggregator ──▶ merged cache ──▶ frontend / CLI +``` + +Requests enter nginx. The nginx writes either to a log file (file mode) or via the `ipng_stats_logtail` directive to a UDP socket +(UDP mode), or both. The collector has two ingest goroutines that parse a line into a `LogRecord` and enqueue it on a shared 200 K +channel. A single store goroutine consumes the channel, updating the live map and maintaining the tiered rings. A once-per-minute +timer rotates the live map into the fine ring and (every fifth tick) into the coarse ring, and fans the fresh snapshot out to every +`StreamSnapshots` subscriber. The aggregator is one such subscriber. + +Query RPCs (`TopN`, `Trend`) MUST read only from the rings and MUST NOT read from the live map. The aggregator's cache is itself a +ring built from the merged-view snapshots; it is updated on the same 1-minute cadence regardless of how many collectors are +connected. + +## Components + +### Program 1 — Collector (`cmd/collector`) + +#### Responsibilities + +- Tail on-disk log files via a single `fsnotify.Watcher`, handle logrotate, and re-scan glob patterns periodically to pick up new + files (FR-2.1, NFR-4.1). +- Listen on an optional UDP socket for `ipng_stats_logtail` datagrams (FR-2.2). +- Parse each log line into a `LogRecord` (FR-1). +- Maintain the live map, fine ring, coarse ring, and subscriber fan-out under a single-writer goroutine (FR-3, NFR-1). +- Serve `LogtailService` on `-listen` (FR-5). +- Expose Prometheus metrics on `-prom-listen` (FR-8). + +#### Key data types + +- `LogRecord` — ten fields (website, client_prefix, URI, status, is_tor, asn, method, body_bytes_sent, request_time, source_tag). + Produced by `ParseLine` or `ParseUDPLine` and consumed by the store goroutine. +- `Tuple6` (historical name; carries seven fields now) — the aggregation key. NUL-separated when encoded as a map key for snapshots. + The code name is intentionally stable so downstream tests and consumers are not churned. +- `Snapshot` — `(timestamp, []Entry)` where `Entry = (label, count)` and `label` is an encoded `Tuple6`. + +#### Presents + +- `LogtailService` on TCP (default `:9090`). +- A Prometheus `/metrics` handler on TCP (default `:9100`). + +#### Consumes + +- One or more on-disk log files matched by `--logs` and/or `--logs-file` globs. +- Optionally, a UDP socket on `--logtail-bind:--logtail-port` (default `127.0.0.1`, disabled when port is `0`). + +### Program 2 — Aggregator (`cmd/aggregator`) + +#### Responsibilities + +- Dial every configured collector and subscribe via `StreamSnapshots` (FR-4.2). 
+- Merge incoming snapshots into a single cache using delta-based subtraction, so a collector's contribution is updated in place + rather than accumulated (FR-4.2). +- At startup, call `DumpSnapshots` on each collector once, merge the per-timestamp entries, and load the result into the cache + atomically (FR-4.3). +- Handle collector outages with exponential-backoff reconnect and degraded-collector zeroing (FR-4.4). +- Serve the same `LogtailService` as the collector (FR-5). +- Maintain a `TargetRegistry` that maps collector addresses to display names (updated from the `source` field of incoming + snapshots). + +#### Presents + +- `LogtailService` on TCP (default `:9091`). + +#### Consumes + +- The `StreamSnapshots` and `DumpSnapshots` RPCs on every configured collector (`--collectors`). + +### Program 3 — Frontend (`cmd/frontend`) + +#### Responsibilities + +- Render the drilldown dashboard server-side with no JavaScript (FR-6.1). +- Parse URL query string into filter / group-by / window state (FR-6.2). +- Issue `TopN`, `Trend`, and `ListTargets` concurrently with a 5 s deadline (FR-6.4). +- Render inline SVG sparklines from `TrendResponse` (FR-6.1). +- Support the mini filter-expression language (FR-6.6) and the `raw=1` JSON output (FR-6.5). +- Expose a source-picker row populated from `ListTargets`. + +#### Presents + +- An HTTP dashboard on TCP (default `:8080`). + +#### Consumes + +- Any `LogtailService` endpoint (`--target`, default `localhost:9091` — the aggregator). + +### Program 4 — CLI (`cmd/cli`) + +#### Responsibilities + +- Dispatch to `topn`, `trend`, `stream`, or `targets` (FR-7.1). +- Parse shared and per-subcommand flags, build a `Filter` proto from them, and fan out to every `--target` concurrently (FR-7.2). +- Print human-readable tables by default; switch to JSON with `--json` (FR-7.2). +- Reconnect automatically in `stream` mode (FR-7.3). + +#### Presents + +- Exit status `0` on success, non-zero on RPC error (except `stream`, which runs until interrupted). + +#### Consumes + +- Any `LogtailService` endpoint. + +### Protobuf service (`proto/logtail.proto`) + +One proto file defines every shared type: `Tuple6` is encoded as a NUL-separated label string inside `TopNEntry`, and the +`Snapshot` message carries both fine (1-min) and coarse (5-min) ring contents. `GroupBy` and `Window` are enums; `Filter` carries +optional exact-match fields, regex fields, and the `StatusOp` comparison enum used for both `http_response` and `asn_number`. + +## Operational Concerns + +### Deployment Topology + +A typical deployment is: + +- **Per nginx host:** one `collector` systemd unit, pointed at `/var/log/nginx/*.log` and/or listening on `127.0.0.1:9514` for the + `nginx-ipng-stats-plugin` UDP stream. Exposes `:9090` (gRPC) and `:9100` (Prometheus). +- **Central:** one `aggregator` systemd unit on e.g. `agg:9091`, subscribed to all collectors; and one `frontend` systemd unit on + `agg:8080`, pointed at the aggregator. Operators reach the dashboard via `http://agg:8080/`. Alternatively, the Docker Compose + file in the repo root runs the aggregator and frontend together. +- **Operator laptop:** `logtail-cli` invocations, pointed at the aggregator for fleet-wide questions or at a specific collector for + a single-host drilldown. + +### Configuration + +All four binaries are configured via flags with matching environment variables. The canonical reference is `docs/user-guide.md`. 
+Representative settings: + +- `collector`: `--logs /var/log/nginx/*.log`, `--logtail-port 9514`, `--source $(hostname)`, `--prom-listen :9100`. +- `aggregator`: `--collectors nginx1:9090,nginx2:9090`, `--listen :9091`. +- `frontend`: `--target agg:9091`, `--listen :8080`. +- `cli`: no persistent configuration; every invocation carries `--target`. + +### Reload and Restart Semantics + +- **Collector restart.** The live map and both rings start empty. The file tailer resumes at EOF of each watched file (no historical + replay). The fine ring refills within an hour; the coarse ring within 24 hours. +- **Aggregator restart.** Backfill reconstructs the cache from all collectors' `DumpSnapshots` streams. The gRPC server is listening + before backfill begins (NFR-4.2), so the frontend is never blocked during restart — it just sees an incomplete cache for the few + seconds backfill takes. +- **Collector outage.** The aggregator reconnects with backoff; after three consecutive failures the collector's contribution is + zeroed (FR-4.4) so the merged view does not show stale counts. On recovery the zeroing is reversed by the next snapshot. +- **nginx logrotate.** The collector drains the old fd, closes, and retries the original path. No operator action (NFR-4.1). +- **nginx-ipng-stats-plugin reload.** The plugin's UDP socket is per-worker; a reload simply causes new workers to open fresh + sockets to the same address. The collector sees a brief gap and resumes. + +### Observability of the System Itself + +Primary channel is the collector's Prometheus endpoint (FR-8). Beyond the per-host request counter and the per-source roll-ups, +three UDP counters give direct visibility into the UDP ingest path: + +- `logtail_udp_packets_received_total` — what arrived. +- `logtail_udp_loglines_success_total` — what parsed cleanly. +- `logtail_udp_loglines_consumed_total` — what made it to the store (i.e. was not dropped by a full channel). + +`received - success` is the parse-failure rate; `success - consumed` is the back-pressure drop rate. Operators should alert on both +being non-zero. + +Each binary logs human-readable lines on stdout for connect/disconnect events, logrotate reopen, backfill timing, and degraded +transitions. No per-request logging. + +### Failure Modes + +- **High-cardinality DDoS.** The live map hits 100 000 entries and stops accepting new keys until the next rotation (NFR-2.1). + Existing top-K entries keep accumulating, so the attacker's dominant prefixes / URIs remain visible. The cap resets every minute. +- **Collector crash.** In-flight live-map state for the current minute is lost. The next collector start resumes tailing; the + aggregator zeroes the degraded collector's contribution after a few seconds and reintegrates it when snapshots resume. +- **Aggregator crash.** No collector is affected. The operator restarts the aggregator; backfill reconstructs the cache. +- **Frontend crash.** Stateless. Operator restarts. +- **UDP datagram loss.** Any datagram dropped in-kernel (socket buffer full, network drop) does not register as a parse failure; it + is simply invisible. Operators should size `SO_RCVBUF` appropriately; the collector already requests 4 MiB. +- **Malformed log lines.** File format: lines with <8 tab-separated fields are silently skipped; an invalid IP also drops the line. + UDP: packets without exactly 12 fields are counted as received-but-not-success and dropped. 
+- **Clock skew between collectors.** Trend sparklines derived from merged data assume collectors are roughly NTP-synced. Per-bucket + alignment is to the local minute / 5-minute boundary of each collector. +- **gRPC traffic over untrusted links.** The system does not ship TLS; operators should front the gRPC ports with a TLS-terminating + proxy or an IPsec tunnel. + +### Security + +- **No TLS, no auth.** Deliberate (NFR-6.1). Deploy on a trusted network or behind a TLS proxy. +- **UDP bind.** Default `127.0.0.1` so merely turning on the listener does not expose a public socket (NFR-6.2). +- **Client-IP truncation.** Client addresses are truncated at ingest; the system never stores full client IPs (NFR-6.3, FR-1.3). +- **Query-string stripping.** URIs lose their query strings at ingest. A user who cares about `?q=` parameters must re-engineer + nginx's log format — and then accept that cardinality consequence. + +## Alternatives Considered + +- **Log shipping to ClickHouse / ELK.** Rejected as the default: adds a storage tier to a problem that fits in a per-host 1 GB + ring, for the target fleet size. A future ClickHouse export from the aggregator is viable and would be additive (deferred). +- **Raw request logging to Kafka.** Rejected: preserves every request at much higher cost for no visibility benefit; the operator + wants top-K ranking, not a replay log. If raw logging is desired, nginx's own access log is the right tool. +- **Promtail / Grafana Loki.** Rejected as the primary interface. Loki is excellent for free-text log search but weak for fast + ranked aggregations over dozens of dimensions; the drilldown interaction the operator wants fits poorly into LogQL. +- **In-process Lua aggregator on each nginx.** Considered for the collector tier. Rejected: shipping counters to a central view + still requires a process outside nginx; keeping the ingest path out of the nginx worker avoids a class of latency regressions. +- **pull-based collector polling (aggregator polls collectors every second).** Rejected in favor of push. Polling multiplies query + latency and makes the aggregator's cache stale by the poll interval. Push-stream with delta merge keeps the cache within seconds + of real time. +- **One metric name for both per-host and per-source_tag roll-ups.** Rejected for Prometheus hygiene. Mixing different label sets + under one metric name breaks aggregation rules; separate metric names (`_by_source`) are clearer and easier to query. +- **Cross-product of `host × source_tag` for every counter and histogram.** Rejected. With ~20 tags and ~50 hosts the cardinality + explodes quickly on the duration histogram without operational benefit. The duration histogram stays per-host; requests and body + size get a parallel `_by_source` rollup. +- **Writing every `snapshot` to disk for restart recovery.** Rejected in favor of `DumpSnapshots` RPC backfill. Disk-backed + persistence would multiply operational surface (rotation, fsck, permissions) for a feature that needs to survive only an + aggregator restart. + +## Decisions Deferred Post-v0.2 + +- **ClickHouse export from aggregator.** 1-minute pre-aggregated rows pushed into a `SummingMergeTree` table for 7-day / 30-day + windows. Frontend would route longer windows to ClickHouse while shorter windows stay on the in-memory rings. Strictly additive; + no interface changes. Deferred until a concrete retention requirement lands. 
+- **TLS on gRPC endpoints.** The argument for shipping TLS changes if/when the aggregator is deployed across an untrusted network + segment. Until then, a front proxy is the right shape. +- **Ring-buffer sizing on a per-collector basis.** Today every collector ships the same 60×50 K / 288×5 K dimensions. A + low-traffic collector can afford smaller rings; a hot one might want larger. Deferred — the uniform default is operationally + simpler. +- **Authenticated Prometheus scraping.** The endpoint is currently open on `:9100`. If a future deployment puts the scraper on a + less-trusted path, scrape-side auth (bearer token, TLS client cert) is the right add-on. +- **Coarse tier beyond 24 h.** Extending to 7 days in-memory would cost ~70 MB per collector but add 2016 buckets to iterate on a + `W24H+` query. Deferred until the operator wants a 7-day drilldown without ClickHouse. diff --git a/docs/USERGUIDE.md b/docs/user-guide.md similarity index 100% rename from docs/USERGUIDE.md rename to docs/user-guide.md diff --git a/internal/store/store.go b/internal/store/store.go index 8966fc4..a8c912d 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -21,14 +21,16 @@ const ( CoarseEvery = 5 // fine ticks between coarse writes ) -// Tuple6 is the aggregation key (website, prefix, URI, status, is_tor, asn). +// Tuple6 is the aggregation key (website, prefix, URI, status, is_tor, asn, source_tag). +// The name is kept for source-compat with older call sites; it now carries seven fields. type Tuple6 struct { - Website string - Prefix string - URI string - Status string - IsTor bool - ASN int32 + Website string + Prefix string + URI string + Status string + IsTor bool + ASN int32 + SourceTag string } // Entry is a labelled count used in snapshots and query results. @@ -85,12 +87,13 @@ func EncodeTuple(t Tuple6) string { if t.IsTor { tor = "1" } - return t.Website + "\x00" + t.Prefix + "\x00" + t.URI + "\x00" + t.Status + "\x00" + tor + "\x00" + strconv.Itoa(int(t.ASN)) + return t.Website + "\x00" + t.Prefix + "\x00" + t.URI + "\x00" + t.Status + "\x00" + tor + "\x00" + strconv.Itoa(int(t.ASN)) + "\x00" + t.SourceTag } // LabelTuple decodes a NUL-separated snapshot label back into a Tuple6. +// Labels from older snapshots (6 fields) round-trip with SourceTag=="". 
func LabelTuple(label string) Tuple6 { - parts := splitN(label, '\x00', 6) + parts := splitN(label, '\x00', 7) if len(parts) < 4 { return Tuple6{} } @@ -98,11 +101,14 @@ func LabelTuple(label string) Tuple6 { if len(parts) >= 5 { t.IsTor = parts[4] == "1" } - if len(parts) == 6 { + if len(parts) >= 6 { if n, err := strconv.Atoi(parts[5]); err == nil { t.ASN = int32(n) } } + if len(parts) == 7 { + t.SourceTag = parts[6] + } return t } @@ -239,6 +245,9 @@ func MatchesFilter(t Tuple6, f *CompiledFilter) bool { if p.AsnNumber != nil && !matchesAsnOp(t.ASN, p.GetAsnNumber(), p.AsnOp) { return false } + if p.IpngSourceTag != nil && t.SourceTag != p.GetIpngSourceTag() { + return false + } return true } @@ -299,6 +308,8 @@ func DimensionLabel(t Tuple6, g pb.GroupBy) string { return t.Status case pb.GroupBy_ASN_NUMBER: return strconv.Itoa(int(t.ASN)) + case pb.GroupBy_SOURCE_TAG: + return t.SourceTag default: return t.Website } diff --git a/internal/store/store_test.go b/internal/store/store_test.go index 6fac42e..c1effc1 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -335,3 +335,45 @@ func TestDimensionLabelASN(t *testing.T) { t.Errorf("DimensionLabel ASN: got %q, want %q", got, "12345") } } + +// --- SourceTag label encoding, filtering, and DimensionLabel --- + +func TestEncodeLabelTupleRoundtripWithSourceTag(t *testing.T) { + for _, tag := range []string{"", "direct", "cdn", "tag with spaces"} { + orig := Tuple6{Website: "a.com", Prefix: "1.2.3.0/24", URI: "/x", Status: "200", SourceTag: tag} + got := LabelTuple(EncodeTuple(orig)) + if got != orig { + t.Errorf("roundtrip mismatch for tag=%q: got %+v, want %+v", tag, got, orig) + } + } +} + +func TestLabelTupleBackwardCompatNoSourceTag(t *testing.T) { + // 6-field label (pre-source_tag snapshot) decodes with SourceTag="". 
+ label := "a.com\x001.2.3.0/24\x00/x\x00200\x000\x0012345" + got := LabelTuple(label) + if got.SourceTag != "" { + t.Errorf("expected empty SourceTag for 6-field label, got %q", got.SourceTag) + } + if got.ASN != 12345 { + t.Errorf("expected ASN=12345, got %d", got.ASN) + } +} + +func TestMatchesFilterSourceTag(t *testing.T) { + tag := "cdn" + cf := CompileFilter(&pb.Filter{IpngSourceTag: &tag}) + if !MatchesFilter(Tuple6{SourceTag: "cdn"}, cf) { + t.Fatal("should match equal source_tag") + } + if MatchesFilter(Tuple6{SourceTag: "direct"}, cf) { + t.Fatal("should not match different source_tag") + } +} + +func TestDimensionLabelSourceTag(t *testing.T) { + got := DimensionLabel(Tuple6{SourceTag: "cdn"}, pb.GroupBy_SOURCE_TAG) + if got != "cdn" { + t.Errorf("DimensionLabel SOURCE_TAG: got %q, want %q", got, "cdn") + } +} diff --git a/proto/logtail.proto b/proto/logtail.proto index a44b022..7e4947c 100644 --- a/proto/logtail.proto +++ b/proto/logtail.proto @@ -38,6 +38,7 @@ message Filter { TorFilter tor = 8; // restrict to TOR / non-TOR clients optional int32 asn_number = 9; // filter by client ASN StatusOp asn_op = 10; // operator for asn_number; ignored when unset + optional string ipng_source_tag = 13; // filter by nginx source tag } enum GroupBy { @@ -46,6 +47,7 @@ enum GroupBy { REQUEST_URI = 2; HTTP_RESPONSE = 3; ASN_NUMBER = 4; + SOURCE_TAG = 5; } enum Window { diff --git a/proto/logtailpb/logtail.pb.go b/proto/logtailpb/logtail.pb.go index 4092b34..0c2e3a2 100644 --- a/proto/logtailpb/logtail.pb.go +++ b/proto/logtailpb/logtail.pb.go @@ -2,7 +2,7 @@ // versions: // protoc-gen-go v1.36.11 // protoc v3.21.12 -// source: proto/logtail.proto +// source: logtail.proto package logtailpb @@ -56,11 +56,11 @@ func (x TorFilter) String() string { } func (TorFilter) Descriptor() protoreflect.EnumDescriptor { - return file_proto_logtail_proto_enumTypes[0].Descriptor() + return file_logtail_proto_enumTypes[0].Descriptor() } func (TorFilter) Type() protoreflect.EnumType { - return &file_proto_logtail_proto_enumTypes[0] + return &file_logtail_proto_enumTypes[0] } func (x TorFilter) Number() protoreflect.EnumNumber { @@ -69,7 +69,7 @@ func (x TorFilter) Number() protoreflect.EnumNumber { // Deprecated: Use TorFilter.Descriptor instead. func (TorFilter) EnumDescriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{0} + return file_logtail_proto_rawDescGZIP(), []int{0} } // StatusOp is the comparison operator applied to http_response in a Filter. @@ -116,11 +116,11 @@ func (x StatusOp) String() string { } func (StatusOp) Descriptor() protoreflect.EnumDescriptor { - return file_proto_logtail_proto_enumTypes[1].Descriptor() + return file_logtail_proto_enumTypes[1].Descriptor() } func (StatusOp) Type() protoreflect.EnumType { - return &file_proto_logtail_proto_enumTypes[1] + return &file_logtail_proto_enumTypes[1] } func (x StatusOp) Number() protoreflect.EnumNumber { @@ -129,7 +129,7 @@ func (x StatusOp) Number() protoreflect.EnumNumber { // Deprecated: Use StatusOp.Descriptor instead. func (StatusOp) EnumDescriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{1} + return file_logtail_proto_rawDescGZIP(), []int{1} } type GroupBy int32 @@ -140,6 +140,7 @@ const ( GroupBy_REQUEST_URI GroupBy = 2 GroupBy_HTTP_RESPONSE GroupBy = 3 GroupBy_ASN_NUMBER GroupBy = 4 + GroupBy_SOURCE_TAG GroupBy = 5 ) // Enum value maps for GroupBy. 
@@ -150,6 +151,7 @@ var ( 2: "REQUEST_URI", 3: "HTTP_RESPONSE", 4: "ASN_NUMBER", + 5: "SOURCE_TAG", } GroupBy_value = map[string]int32{ "WEBSITE": 0, @@ -157,6 +159,7 @@ var ( "REQUEST_URI": 2, "HTTP_RESPONSE": 3, "ASN_NUMBER": 4, + "SOURCE_TAG": 5, } ) @@ -171,11 +174,11 @@ func (x GroupBy) String() string { } func (GroupBy) Descriptor() protoreflect.EnumDescriptor { - return file_proto_logtail_proto_enumTypes[2].Descriptor() + return file_logtail_proto_enumTypes[2].Descriptor() } func (GroupBy) Type() protoreflect.EnumType { - return &file_proto_logtail_proto_enumTypes[2] + return &file_logtail_proto_enumTypes[2] } func (x GroupBy) Number() protoreflect.EnumNumber { @@ -184,7 +187,7 @@ func (x GroupBy) Number() protoreflect.EnumNumber { // Deprecated: Use GroupBy.Descriptor instead. func (GroupBy) EnumDescriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{2} + return file_logtail_proto_rawDescGZIP(), []int{2} } type Window int32 @@ -229,11 +232,11 @@ func (x Window) String() string { } func (Window) Descriptor() protoreflect.EnumDescriptor { - return file_proto_logtail_proto_enumTypes[3].Descriptor() + return file_logtail_proto_enumTypes[3].Descriptor() } func (Window) Type() protoreflect.EnumType { - return &file_proto_logtail_proto_enumTypes[3] + return &file_logtail_proto_enumTypes[3] } func (x Window) Number() protoreflect.EnumNumber { @@ -242,7 +245,7 @@ func (x Window) Number() protoreflect.EnumNumber { // Deprecated: Use Window.Descriptor instead. func (Window) EnumDescriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{3} + return file_logtail_proto_rawDescGZIP(), []int{3} } // Filter restricts results to entries matching all specified fields. @@ -261,13 +264,14 @@ type Filter struct { Tor TorFilter `protobuf:"varint,8,opt,name=tor,proto3,enum=logtail.TorFilter" json:"tor,omitempty"` // restrict to TOR / non-TOR clients AsnNumber *int32 `protobuf:"varint,9,opt,name=asn_number,json=asnNumber,proto3,oneof" json:"asn_number,omitempty"` // filter by client ASN AsnOp StatusOp `protobuf:"varint,10,opt,name=asn_op,json=asnOp,proto3,enum=logtail.StatusOp" json:"asn_op,omitempty"` // operator for asn_number; ignored when unset + IpngSourceTag *string `protobuf:"bytes,13,opt,name=ipng_source_tag,json=ipngSourceTag,proto3,oneof" json:"ipng_source_tag,omitempty"` // filter by nginx source tag unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } func (x *Filter) Reset() { *x = Filter{} - mi := &file_proto_logtail_proto_msgTypes[0] + mi := &file_logtail_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -279,7 +283,7 @@ func (x *Filter) String() string { func (*Filter) ProtoMessage() {} func (x *Filter) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[0] + mi := &file_logtail_proto_msgTypes[0] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -292,7 +296,7 @@ func (x *Filter) ProtoReflect() protoreflect.Message { // Deprecated: Use Filter.ProtoReflect.Descriptor instead. 
func (*Filter) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{0} + return file_logtail_proto_rawDescGZIP(), []int{0} } func (x *Filter) GetWebsite() string { @@ -379,6 +383,13 @@ func (x *Filter) GetAsnOp() StatusOp { return StatusOp_EQ } +func (x *Filter) GetIpngSourceTag() string { + if x != nil && x.IpngSourceTag != nil { + return *x.IpngSourceTag + } + return "" +} + type TopNRequest struct { state protoimpl.MessageState `protogen:"open.v1"` Filter *Filter `protobuf:"bytes,1,opt,name=filter,proto3" json:"filter,omitempty"` @@ -391,7 +402,7 @@ type TopNRequest struct { func (x *TopNRequest) Reset() { *x = TopNRequest{} - mi := &file_proto_logtail_proto_msgTypes[1] + mi := &file_logtail_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -403,7 +414,7 @@ func (x *TopNRequest) String() string { func (*TopNRequest) ProtoMessage() {} func (x *TopNRequest) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[1] + mi := &file_logtail_proto_msgTypes[1] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -416,7 +427,7 @@ func (x *TopNRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use TopNRequest.ProtoReflect.Descriptor instead. func (*TopNRequest) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{1} + return file_logtail_proto_rawDescGZIP(), []int{1} } func (x *TopNRequest) GetFilter() *Filter { @@ -457,7 +468,7 @@ type TopNEntry struct { func (x *TopNEntry) Reset() { *x = TopNEntry{} - mi := &file_proto_logtail_proto_msgTypes[2] + mi := &file_logtail_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -469,7 +480,7 @@ func (x *TopNEntry) String() string { func (*TopNEntry) ProtoMessage() {} func (x *TopNEntry) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[2] + mi := &file_logtail_proto_msgTypes[2] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -482,7 +493,7 @@ func (x *TopNEntry) ProtoReflect() protoreflect.Message { // Deprecated: Use TopNEntry.ProtoReflect.Descriptor instead. func (*TopNEntry) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{2} + return file_logtail_proto_rawDescGZIP(), []int{2} } func (x *TopNEntry) GetLabel() string { @@ -509,7 +520,7 @@ type TopNResponse struct { func (x *TopNResponse) Reset() { *x = TopNResponse{} - mi := &file_proto_logtail_proto_msgTypes[3] + mi := &file_logtail_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -521,7 +532,7 @@ func (x *TopNResponse) String() string { func (*TopNResponse) ProtoMessage() {} func (x *TopNResponse) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[3] + mi := &file_logtail_proto_msgTypes[3] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -534,7 +545,7 @@ func (x *TopNResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use TopNResponse.ProtoReflect.Descriptor instead. 
func (*TopNResponse) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{3} + return file_logtail_proto_rawDescGZIP(), []int{3} } func (x *TopNResponse) GetEntries() []*TopNEntry { @@ -561,7 +572,7 @@ type TrendRequest struct { func (x *TrendRequest) Reset() { *x = TrendRequest{} - mi := &file_proto_logtail_proto_msgTypes[4] + mi := &file_logtail_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -573,7 +584,7 @@ func (x *TrendRequest) String() string { func (*TrendRequest) ProtoMessage() {} func (x *TrendRequest) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[4] + mi := &file_logtail_proto_msgTypes[4] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -586,7 +597,7 @@ func (x *TrendRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use TrendRequest.ProtoReflect.Descriptor instead. func (*TrendRequest) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{4} + return file_logtail_proto_rawDescGZIP(), []int{4} } func (x *TrendRequest) GetFilter() *Filter { @@ -613,7 +624,7 @@ type TrendPoint struct { func (x *TrendPoint) Reset() { *x = TrendPoint{} - mi := &file_proto_logtail_proto_msgTypes[5] + mi := &file_logtail_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -625,7 +636,7 @@ func (x *TrendPoint) String() string { func (*TrendPoint) ProtoMessage() {} func (x *TrendPoint) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[5] + mi := &file_logtail_proto_msgTypes[5] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -638,7 +649,7 @@ func (x *TrendPoint) ProtoReflect() protoreflect.Message { // Deprecated: Use TrendPoint.ProtoReflect.Descriptor instead. func (*TrendPoint) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{5} + return file_logtail_proto_rawDescGZIP(), []int{5} } func (x *TrendPoint) GetTimestampUnix() int64 { @@ -665,7 +676,7 @@ type TrendResponse struct { func (x *TrendResponse) Reset() { *x = TrendResponse{} - mi := &file_proto_logtail_proto_msgTypes[6] + mi := &file_logtail_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -677,7 +688,7 @@ func (x *TrendResponse) String() string { func (*TrendResponse) ProtoMessage() {} func (x *TrendResponse) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[6] + mi := &file_logtail_proto_msgTypes[6] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -690,7 +701,7 @@ func (x *TrendResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use TrendResponse.ProtoReflect.Descriptor instead. 
func (*TrendResponse) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{6} + return file_logtail_proto_rawDescGZIP(), []int{6} } func (x *TrendResponse) GetPoints() []*TrendPoint { @@ -715,7 +726,7 @@ type SnapshotRequest struct { func (x *SnapshotRequest) Reset() { *x = SnapshotRequest{} - mi := &file_proto_logtail_proto_msgTypes[7] + mi := &file_logtail_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -727,7 +738,7 @@ func (x *SnapshotRequest) String() string { func (*SnapshotRequest) ProtoMessage() {} func (x *SnapshotRequest) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[7] + mi := &file_logtail_proto_msgTypes[7] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -740,7 +751,7 @@ func (x *SnapshotRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use SnapshotRequest.ProtoReflect.Descriptor instead. func (*SnapshotRequest) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{7} + return file_logtail_proto_rawDescGZIP(), []int{7} } type Snapshot struct { @@ -755,7 +766,7 @@ type Snapshot struct { func (x *Snapshot) Reset() { *x = Snapshot{} - mi := &file_proto_logtail_proto_msgTypes[8] + mi := &file_logtail_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -767,7 +778,7 @@ func (x *Snapshot) String() string { func (*Snapshot) ProtoMessage() {} func (x *Snapshot) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[8] + mi := &file_logtail_proto_msgTypes[8] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -780,7 +791,7 @@ func (x *Snapshot) ProtoReflect() protoreflect.Message { // Deprecated: Use Snapshot.ProtoReflect.Descriptor instead. func (*Snapshot) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{8} + return file_logtail_proto_rawDescGZIP(), []int{8} } func (x *Snapshot) GetSource() string { @@ -819,7 +830,7 @@ type DumpSnapshotsRequest struct { func (x *DumpSnapshotsRequest) Reset() { *x = DumpSnapshotsRequest{} - mi := &file_proto_logtail_proto_msgTypes[9] + mi := &file_logtail_proto_msgTypes[9] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -831,7 +842,7 @@ func (x *DumpSnapshotsRequest) String() string { func (*DumpSnapshotsRequest) ProtoMessage() {} func (x *DumpSnapshotsRequest) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[9] + mi := &file_logtail_proto_msgTypes[9] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -844,7 +855,7 @@ func (x *DumpSnapshotsRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use DumpSnapshotsRequest.ProtoReflect.Descriptor instead. 
func (*DumpSnapshotsRequest) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{9} + return file_logtail_proto_rawDescGZIP(), []int{9} } type ListTargetsRequest struct { @@ -855,7 +866,7 @@ type ListTargetsRequest struct { func (x *ListTargetsRequest) Reset() { *x = ListTargetsRequest{} - mi := &file_proto_logtail_proto_msgTypes[10] + mi := &file_logtail_proto_msgTypes[10] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -867,7 +878,7 @@ func (x *ListTargetsRequest) String() string { func (*ListTargetsRequest) ProtoMessage() {} func (x *ListTargetsRequest) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[10] + mi := &file_logtail_proto_msgTypes[10] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -880,7 +891,7 @@ func (x *ListTargetsRequest) ProtoReflect() protoreflect.Message { // Deprecated: Use ListTargetsRequest.ProtoReflect.Descriptor instead. func (*ListTargetsRequest) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{10} + return file_logtail_proto_rawDescGZIP(), []int{10} } type TargetInfo struct { @@ -893,7 +904,7 @@ type TargetInfo struct { func (x *TargetInfo) Reset() { *x = TargetInfo{} - mi := &file_proto_logtail_proto_msgTypes[11] + mi := &file_logtail_proto_msgTypes[11] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -905,7 +916,7 @@ func (x *TargetInfo) String() string { func (*TargetInfo) ProtoMessage() {} func (x *TargetInfo) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[11] + mi := &file_logtail_proto_msgTypes[11] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -918,7 +929,7 @@ func (x *TargetInfo) ProtoReflect() protoreflect.Message { // Deprecated: Use TargetInfo.ProtoReflect.Descriptor instead. func (*TargetInfo) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{11} + return file_logtail_proto_rawDescGZIP(), []int{11} } func (x *TargetInfo) GetName() string { @@ -944,7 +955,7 @@ type ListTargetsResponse struct { func (x *ListTargetsResponse) Reset() { *x = ListTargetsResponse{} - mi := &file_proto_logtail_proto_msgTypes[12] + mi := &file_logtail_proto_msgTypes[12] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -956,7 +967,7 @@ func (x *ListTargetsResponse) String() string { func (*ListTargetsResponse) ProtoMessage() {} func (x *ListTargetsResponse) ProtoReflect() protoreflect.Message { - mi := &file_proto_logtail_proto_msgTypes[12] + mi := &file_logtail_proto_msgTypes[12] if x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -969,7 +980,7 @@ func (x *ListTargetsResponse) ProtoReflect() protoreflect.Message { // Deprecated: Use ListTargetsResponse.ProtoReflect.Descriptor instead. 
func (*ListTargetsResponse) Descriptor() ([]byte, []int) { - return file_proto_logtail_proto_rawDescGZIP(), []int{12} + return file_logtail_proto_rawDescGZIP(), []int{12} } func (x *ListTargetsResponse) GetTargets() []*TargetInfo { @@ -979,11 +990,11 @@ func (x *ListTargetsResponse) GetTargets() []*TargetInfo { return nil } -var File_proto_logtail_proto protoreflect.FileDescriptor +var File_logtail_proto protoreflect.FileDescriptor -const file_proto_logtail_proto_rawDesc = "" + +const file_logtail_proto_rawDesc = "" + "\n" + - "\x13proto/logtail.proto\x12\alogtail\"\xa8\x05\n" + + "\rlogtail.proto\x12\alogtail\"\xe9\x05\n" + "\x06Filter\x12\x1d\n" + "\awebsite\x18\x01 \x01(\tH\x00R\awebsite\x88\x01\x01\x12(\n" + "\rclient_prefix\x18\x02 \x01(\tH\x01R\fclientPrefix\x88\x01\x01\x12-\n" + @@ -998,7 +1009,8 @@ const file_proto_logtail_proto_rawDesc = "" + "\n" + "asn_number\x18\t \x01(\x05H\bR\tasnNumber\x88\x01\x01\x12(\n" + "\x06asn_op\x18\n" + - " \x01(\x0e2\x11.logtail.StatusOpR\x05asnOpB\n" + + " \x01(\x0e2\x11.logtail.StatusOpR\x05asnOp\x12+\n" + + "\x0fipng_source_tag\x18\r \x01(\tH\tR\ripngSourceTag\x88\x01\x01B\n" + "\n" + "\b_websiteB\x10\n" + "\x0e_client_prefixB\x13\n" + @@ -1009,7 +1021,8 @@ const file_proto_logtail_proto_rawDesc = "" + "_uri_regexB\x18\n" + "\x16_website_regex_excludeB\x14\n" + "\x12_uri_regex_excludeB\r\n" + - "\v_asn_number\"\x9a\x01\n" + + "\v_asn_numberB\x12\n" + + "\x10_ipng_source_tag\"\x9a\x01\n" + "\vTopNRequest\x12'\n" + "\x06filter\x18\x01 \x01(\v2\x0f.logtail.FilterR\x06filter\x12+\n" + "\bgroup_by\x18\x02 \x01(\x0e2\x10.logtail.GroupByR\agroupBy\x12\f\n" + @@ -1056,14 +1069,16 @@ const file_proto_logtail_proto_rawDesc = "" + "\x02GT\x10\x02\x12\x06\n" + "\x02GE\x10\x03\x12\x06\n" + "\x02LT\x10\x04\x12\x06\n" + - "\x02LE\x10\x05*]\n" + + "\x02LE\x10\x05*m\n" + "\aGroupBy\x12\v\n" + "\aWEBSITE\x10\x00\x12\x11\n" + "\rCLIENT_PREFIX\x10\x01\x12\x0f\n" + "\vREQUEST_URI\x10\x02\x12\x11\n" + "\rHTTP_RESPONSE\x10\x03\x12\x0e\n" + "\n" + - "ASN_NUMBER\x10\x04*A\n" + + "ASN_NUMBER\x10\x04\x12\x0e\n" + + "\n" + + "SOURCE_TAG\x10\x05*A\n" + "\x06Window\x12\a\n" + "\x03W1M\x10\x00\x12\a\n" + "\x03W5M\x10\x01\x12\b\n" + @@ -1079,20 +1094,20 @@ const file_proto_logtail_proto_rawDesc = "" + "\rDumpSnapshots\x12\x1d.logtail.DumpSnapshotsRequest\x1a\x11.logtail.Snapshot0\x01B0Z.git.ipng.ch/ipng/nginx-logtail/proto/logtailpbb\x06proto3" var ( - file_proto_logtail_proto_rawDescOnce sync.Once - file_proto_logtail_proto_rawDescData []byte + file_logtail_proto_rawDescOnce sync.Once + file_logtail_proto_rawDescData []byte ) -func file_proto_logtail_proto_rawDescGZIP() []byte { - file_proto_logtail_proto_rawDescOnce.Do(func() { - file_proto_logtail_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_proto_logtail_proto_rawDesc), len(file_proto_logtail_proto_rawDesc))) +func file_logtail_proto_rawDescGZIP() []byte { + file_logtail_proto_rawDescOnce.Do(func() { + file_logtail_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_logtail_proto_rawDesc), len(file_logtail_proto_rawDesc))) }) - return file_proto_logtail_proto_rawDescData + return file_logtail_proto_rawDescData } -var file_proto_logtail_proto_enumTypes = make([]protoimpl.EnumInfo, 4) -var file_proto_logtail_proto_msgTypes = make([]protoimpl.MessageInfo, 13) -var file_proto_logtail_proto_goTypes = []any{ +var file_logtail_proto_enumTypes = make([]protoimpl.EnumInfo, 4) +var file_logtail_proto_msgTypes = make([]protoimpl.MessageInfo, 13) +var 
file_logtail_proto_goTypes = []any{ (TorFilter)(0), // 0: logtail.TorFilter (StatusOp)(0), // 1: logtail.StatusOp (GroupBy)(0), // 2: logtail.GroupBy @@ -1111,7 +1126,7 @@ var file_proto_logtail_proto_goTypes = []any{ (*TargetInfo)(nil), // 15: logtail.TargetInfo (*ListTargetsResponse)(nil), // 16: logtail.ListTargetsResponse } -var file_proto_logtail_proto_depIdxs = []int32{ +var file_logtail_proto_depIdxs = []int32{ 1, // 0: logtail.Filter.status_op:type_name -> logtail.StatusOp 0, // 1: logtail.Filter.tor:type_name -> logtail.TorFilter 1, // 2: logtail.Filter.asn_op:type_name -> logtail.StatusOp @@ -1141,28 +1156,28 @@ var file_proto_logtail_proto_depIdxs = []int32{ 0, // [0:12] is the sub-list for field type_name } -func init() { file_proto_logtail_proto_init() } -func file_proto_logtail_proto_init() { - if File_proto_logtail_proto != nil { +func init() { file_logtail_proto_init() } +func file_logtail_proto_init() { + if File_logtail_proto != nil { return } - file_proto_logtail_proto_msgTypes[0].OneofWrappers = []any{} + file_logtail_proto_msgTypes[0].OneofWrappers = []any{} type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: unsafe.Slice(unsafe.StringData(file_proto_logtail_proto_rawDesc), len(file_proto_logtail_proto_rawDesc)), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_logtail_proto_rawDesc), len(file_logtail_proto_rawDesc)), NumEnums: 4, NumMessages: 13, NumExtensions: 0, NumServices: 1, }, - GoTypes: file_proto_logtail_proto_goTypes, - DependencyIndexes: file_proto_logtail_proto_depIdxs, - EnumInfos: file_proto_logtail_proto_enumTypes, - MessageInfos: file_proto_logtail_proto_msgTypes, + GoTypes: file_logtail_proto_goTypes, + DependencyIndexes: file_logtail_proto_depIdxs, + EnumInfos: file_logtail_proto_enumTypes, + MessageInfos: file_logtail_proto_msgTypes, }.Build() - File_proto_logtail_proto = out.File - file_proto_logtail_proto_goTypes = nil - file_proto_logtail_proto_depIdxs = nil + File_logtail_proto = out.File + file_logtail_proto_goTypes = nil + file_logtail_proto_depIdxs = nil } diff --git a/proto/logtailpb/logtail_grpc.pb.go b/proto/logtailpb/logtail_grpc.pb.go index 66a48bd..432c59f 100644 --- a/proto/logtailpb/logtail_grpc.pb.go +++ b/proto/logtailpb/logtail_grpc.pb.go @@ -2,7 +2,7 @@ // versions: // - protoc-gen-go-grpc v1.6.1 // - protoc v3.21.12 -// source: proto/logtail.proto +// source: logtail.proto package logtailpb @@ -276,5 +276,5 @@ var LogtailService_ServiceDesc = grpc.ServiceDesc{ ServerStreams: true, }, }, - Metadata: "proto/logtail.proto", + Metadata: "logtail.proto", }