Files
nginx-logtail/cmd/collector/parser.go
T
pim 6647f95be4 RELEASE 1.0.1: v2 log format, source_tag-labeled metrics, lint cleanup
Wire-format and metric overhaul. Both file and UDP ingest now share one
versioned ParseLine that dispatches on the v<N>\t prefix; v1 stays
unchanged, v2 adds $bytes_sent (replacing $body_bytes_sent),
$request_length, $upstream_response_time, and $upstream_status. File
ingest gains the same versioning, and the legacy positional file format
is removed (no live deployments).

Prometheus exposition is rewritten:

  - nginx_http_bytes_sent and nginx_http_request_duration_seconds gain
    a source_tag label.
  - nginx_http_requests_by_source_total gains status_class.
  - New v2-only metrics: nginx_http_request_bytes,
    nginx_http_upstream_duration_seconds,
    nginx_http_upstream_requests_total{status_class}.
  - Dropped nginx_http_response_body_bytes_by_source (subsumed by the
    dual-labeled bytes_sent metric).

Adds 'make fixstyle' (gofmt -w) and clears all golangci-lint findings
across the repo (errcheck, S1001, ST1005, unused).

Docs in design.md FR-2/FR-8 and user-guide.md are rewritten to present
v2 as the recommended log format.
2026-05-01 15:40:53 +02:00

189 lines
5.2 KiB
Go

package main
import (
"fmt"
"net"
"strconv"
"strings"
)
// LogRecord holds the dimensions extracted from a single nginx log line.
//
// BytesSent carries $body_bytes_sent for v1 records and $bytes_sent for v2
// records — operators see a small step up when emitters move to v2 because v2
// includes header overhead.
//
// RequestLength, UpstreamResponseTime, UpstreamStatus, HasUpstream are v2-only.
// In v1 records HasUpstream is always false and the related fields are zero.
type LogRecord struct {
	// Website is the $host field, copied verbatim.
	Website string
	// ClientPrefix is $remote_addr masked to the configured prefix length and
	// rendered as a CIDR string (e.g. "1.2.3.0/24").
	ClientPrefix string
	// URI is $request_uri with everything from the first '?' onward removed.
	URI string
	// Status is the $status field, kept as the raw string from the log line.
	Status string
	// IsTor is true only when the $is_tor field is exactly "1".
	IsTor bool
	// ASN is $asn parsed as a base-10 32-bit integer; it stays 0 when the
	// field does not parse.
	ASN int32
	// Method is the $request_method field, copied verbatim.
	Method string
	// BytesSent is $body_bytes_sent (v1) or $bytes_sent (v2); 0 when the field
	// is not numeric.
	BytesSent int64
	// RequestLength is $request_length (v2 only); 0 when the field is not
	// numeric.
	RequestLength int64
	// RequestTime is $request_time parsed as a float; 0 when the field is not
	// numeric.
	RequestTime float64
	// UpstreamResponseTime is the last comma-separated entry of
	// $upstream_response_time. Only meaningful when HasUpstream is true.
	UpstreamResponseTime float64
	// UpstreamStatus is the last comma-separated entry of $upstream_status.
	// Only set when HasUpstream is true.
	UpstreamStatus string
	// HasUpstream reports whether $upstream_response_time was present (neither
	// "-" nor empty) and its last entry parsed as a float.
	HasUpstream bool
	// SourceTag is the $ipng_source_tag field, copied verbatim.
	SourceTag string
}
// ParseLine decodes one versioned nginx-logtail line; it is the single entry
// point shared by file ingest and UDP ingest.
//
// The line must begin with a version token followed by a tab ("v<N>\t"). The
// remainder is handed to the parser for that version together with the IPv4
// and IPv6 prefix lengths (v4bits, v6bits) used to mask the client address.
// A line without a tab, or with a version token other than "v1" or "v2",
// yields a zero LogRecord and false, so operators can ship a parser update
// before the emitter switches formats.
func ParseLine(line string, v4bits, v6bits int) (LogRecord, bool) {
	tab := strings.IndexByte(line, '\t')
	if tab < 0 {
		// No version prefix at all.
		return LogRecord{}, false
	}

	version, payload := line[:tab], line[tab+1:]
	if version == "v1" {
		return parseV1(payload, v4bits, v6bits)
	}
	if version == "v2" {
		return parseV2(payload, v4bits, v6bits)
	}
	// Unknown version: reject rather than guess at the layout.
	return LogRecord{}, false
}
// parseV1 decodes a v1 payload, which must consist of exactly 12
// tab-separated fields in this order:
//
//	$host \t $remote_addr \t $request_method \t $request_uri \t $status \t
//	$body_bytes_sent \t $request_time \t $is_tor \t $asn \t
//	$ipng_source_tag \t $server_addr \t $scheme
//
// The trailing $server_addr and $scheme fields only count towards the field
// total; their values are not stored. The function returns false when the
// field count is wrong or $remote_addr cannot be masked by truncateIP.
// Non-numeric $body_bytes_sent, $request_time and $asn values do not reject
// the line; the corresponding record fields are simply left at zero.
func parseV1(payload string, v4bits, v6bits int) (LogRecord, bool) {
	const wantFields = 12

	cols := strings.Split(payload, "\t")
	if len(cols) != wantFields {
		return LogRecord{}, false
	}

	clientPrefix, valid := truncateIP(cols[1], v4bits, v6bits)
	if !valid {
		return LogRecord{}, false
	}

	// Fill the record in wire order so the column mapping is easy to audit.
	var rec LogRecord
	rec.Website = cols[0]
	rec.ClientPrefix = clientPrefix
	rec.Method = cols[2]
	rec.URI = stripQuery(cols[3])
	rec.Status = cols[4]
	rec.BytesSent = parseInt(cols[5])
	rec.RequestTime = parseFloat(cols[6])
	rec.IsTor = cols[7] == "1"
	// Best effort: an unparsable or out-of-range $asn leaves ASN at zero.
	if asn, err := strconv.ParseInt(cols[8], 10, 32); err == nil {
		rec.ASN = int32(asn)
	}
	rec.SourceTag = cols[9]
	return rec, true
}
// parseV2 decodes a v2 payload, which must consist of exactly 15
// tab-separated fields in this order:
//
//	$host \t $remote_addr \t $request_method \t $request_uri \t $status \t
//	$bytes_sent \t $request_length \t $request_time \t
//	$upstream_response_time \t $upstream_status \t
//	$is_tor \t $asn \t $ipng_source_tag \t $server_addr \t $scheme
//
// The trailing $server_addr and $scheme fields only count towards the field
// total; their values are not stored. The function returns false when the
// field count is wrong or $remote_addr cannot be masked by truncateIP.
//
// Upstream handling: when $upstream_response_time is "-" or empty, nginx
// served the response directly and HasUpstream stays false. When nginx
// retried across several upstreams the two upstream fields are
// comma-separated; only the last entry of each is kept, since that is the
// upstream that actually served the response. If that last time entry is not
// a valid float, the upstream fields are left unset.
func parseV2(payload string, v4bits, v6bits int) (LogRecord, bool) {
	const wantFields = 15

	cols := strings.Split(payload, "\t")
	if len(cols) != wantFields {
		return LogRecord{}, false
	}

	clientPrefix, valid := truncateIP(cols[1], v4bits, v6bits)
	if !valid {
		return LogRecord{}, false
	}

	// Fill the record in wire order so the column mapping is easy to audit.
	var rec LogRecord
	rec.Website = cols[0]
	rec.ClientPrefix = clientPrefix
	rec.Method = cols[2]
	rec.URI = stripQuery(cols[3])
	rec.Status = cols[4]
	rec.BytesSent = parseInt(cols[5])
	rec.RequestLength = parseInt(cols[6])
	rec.RequestTime = parseFloat(cols[7])
	rec.IsTor = cols[10] == "1"
	// Best effort: an unparsable or out-of-range $asn leaves ASN at zero.
	if asn, err := strconv.ParseInt(cols[11], 10, 32); err == nil {
		rec.ASN = int32(asn)
	}
	rec.SourceTag = cols[12]

	upstreamTime := cols[8]
	if upstreamTime == "-" || upstreamTime == "" {
		// Served directly by nginx: no upstream data to record.
		return rec, true
	}

	elapsed, err := strconv.ParseFloat(lastCommaPart(upstreamTime), 64)
	if err != nil {
		// Keep the record, but do not claim upstream data we could not read.
		return rec, true
	}
	rec.UpstreamResponseTime = elapsed
	rec.UpstreamStatus = lastCommaPart(cols[9])
	rec.HasUpstream = true
	return rec, true
}
// lastCommaPart returns the final entry of an nginx $upstream_* variable
// value. Plain single values pass through unchanged.
//
// nginx joins the entries for servers tried within one upstream group with
// ", " and, when an internal redirect (X-Accel-Redirect or error_page) moves
// the request to another group, joins the groups with " : ". For example
// "0.001, 0.002 : 0.003" yields "0.003". Both separators are handled so that
// a redirected request still reports the upstream that answered last instead
// of an unparsable compound string.
func lastCommaPart(s string) string {
	// Narrow to the last server group first, then to its last attempt.
	if i := strings.LastIndex(s, " : "); i >= 0 {
		s = s[i+3:]
	}
	if i := strings.LastIndex(s, ", "); i >= 0 {
		s = s[i+2:]
	}
	return s
}
// stripQuery returns uri without its query string: everything from the first
// '?' onward is dropped. A uri that contains no '?' is returned unchanged.
func stripQuery(uri string) string {
	mark := strings.IndexByte(uri, '?')
	if mark == -1 {
		return uri
	}
	return uri[:mark]
}
// parseInt converts s to a base-10 int64 on a best-effort basis. The error
// from strconv.ParseInt is deliberately dropped: log fields such as "-" or ""
// are expected, and for those the result is simply 0. Whatever value
// strconv.ParseInt reports is returned as-is.
func parseInt(s string) int64 {
	value, _ := strconv.ParseInt(s, 10, 64)
	return value
}
// parseFloat converts s to a float64 on a best-effort basis. The error from
// strconv.ParseFloat is deliberately dropped: log fields such as "-" or ""
// are expected, and for those the result is simply 0. Whatever value
// strconv.ParseFloat reports is returned as-is.
func parseFloat(s string) float64 {
	value, _ := strconv.ParseFloat(s, 64)
	return value
}
// truncateIP masks addr down to a network prefix and renders it in CIDR
// notation.
//
// addr is parsed with net.ParseIP. Addresses that have a 4-byte form
// (including IPv4-mapped IPv6 such as "::ffff:1.2.3.4") are masked to v4bits
// and printed dotted-quad; all other addresses are masked to v6bits.
//
// It returns the CIDR string (e.g. "1.2.3.0/24") and true on success. It
// returns "" and false when addr is not a valid IP address, or when the
// selected prefix length is out of range for the address family (negative,
// above 32 for IPv4, above 128 for IPv6). The range check matters because
// net.CIDRMask returns nil for such lengths, and indexing a nil mask would
// panic on the ingest path.
func truncateIP(addr string, v4bits, v6bits int) (string, bool) {
	ip := net.ParseIP(addr)
	if ip == nil {
		return "", false
	}

	// net.ParseIP always yields the 16-byte form; switch to the 4-byte form
	// for IPv4 so the mask width and the printed notation match the family.
	bits := v6bits
	if v4 := ip.To4(); v4 != nil {
		ip, bits = v4, v4bits
	}

	mask := net.CIDRMask(bits, len(ip)*8)
	if mask == nil {
		// Misconfigured prefix length: reject instead of panicking.
		return "", false
	}
	return fmt.Sprintf("%s/%d", ip.Mask(mask), bits), true
}