From b2cb44b9662a5653bd91decb1245f909a67a2ff6 Mon Sep 17 00:00:00 2001 From: Syed Azeez <67051473+syedazeez337@users.noreply.github.com> Date: Wed, 6 May 2026 13:02:28 +0530 Subject: [PATCH] feat(cache): add optional verify timeout to serve_stale (#8070) --- plugin/cache/README.md | 6 +- plugin/cache/cache.go | 5 +- plugin/cache/cache_test.go | 119 +++++++++++++++++++++++++++++++++++++ plugin/cache/handler.go | 55 ++++++++++++++++- plugin/cache/setup.go | 16 ++++- plugin/cache/setup_test.go | 48 +++++++++------ 6 files changed, 226 insertions(+), 23 deletions(-) diff --git a/plugin/cache/README.md b/plugin/cache/README.md index 90dfaa6e3..119f5290f 100644 --- a/plugin/cache/README.md +++ b/plugin/cache/README.md @@ -40,7 +40,7 @@ cache [TTL] [ZONES...] { success CAPACITY [TTL] [MINTTL] denial CAPACITY [TTL] [MINTTL] prefetch AMOUNT [[DURATION] [PERCENTAGE%]] - serve_stale [DURATION] [REFRESH_MODE] + serve_stale [DURATION] [REFRESH_MODE [VERIFY_TIMEOUT]] servfail DURATION disable success|denial [ZONES...] keepttl @@ -74,6 +74,10 @@ cache [TTL] [ZONES...] { from ever being served if an updated response can be retrieved from the source. In `immediate` mode, concurrent requests for the same expired entry dispatch at most one background refresh. + **VERIFY_TIMEOUT** is only valid with `verify` and bounds how long the cache waits for the upstream + verify before falling back to the stale entry. The verify continues in the background and refreshes the + cache when it eventually succeeds, so subsequent queries see the fresh entry. The default of `0` means + wait until the upstream's own timeout (the original `verify` behavior). Example: `serve_stale 1h verify 100ms`. * `servfail` cache SERVFAIL responses for **DURATION**. Setting **DURATION** to 0 will disable caching of SERVFAIL responses. If this option is not set, SERVFAIL responses will be cached for 5 seconds. **DURATION** may not be greater than 5 minutes. diff --git a/plugin/cache/cache.go b/plugin/cache/cache.go index 04eb3283a..15312a1a6 100644 --- a/plugin/cache/cache.go +++ b/plugin/cache/cache.go @@ -42,8 +42,9 @@ type Cache struct { percentage int // Stale serve - staleUpTo time.Duration - verifyStale bool + staleUpTo time.Duration + verifyStale bool + verifyStaleTimeout time.Duration // 0 means wait for upstream until its own timeout (current default). // Positive/negative zone exceptions pexcept []string diff --git a/plugin/cache/cache_test.go b/plugin/cache/cache_test.go index 304a14c64..22af618ec 100644 --- a/plugin/cache/cache_test.go +++ b/plugin/cache/cache_test.go @@ -501,6 +501,103 @@ func TestServeFromStaleCacheFetchVerify(t *testing.T) { } } +func TestServeFromStaleCacheFetchVerifyTimeout(t *testing.T) { + // Verify that when verifyStaleTimeout is set and the upstream is slow, + // the client gets the stale entry within ~timeout, while the in-flight + // verify continues in the background and refreshes the cache. + c := New() + c.staleUpTo = 1 * time.Hour + c.verifyStale = true + c.verifyStaleTimeout = 50 * time.Millisecond + c.Next = ttlBackend(120) + + req := new(dns.Msg) + req.SetQuestion("cached.org.", dns.TypeA) + ctx := context.TODO() + + // Prime the cache with a 120s TTL entry. + rec := dnstest.NewRecorder(&test.ResponseWriter{}) + c.ServeDNS(ctx, rec, req) + if c.pcache.Len() != 1 { + t.Fatalf("Msg with > 0 TTL should have been cached") + } + + // Move forward past the TTL so the entry is stale. + c.now = func() time.Time { return time.Now().Add(3 * time.Minute) } + + // Swap in a slow backend that takes longer than the verify timeout. + bgDone := make(chan struct{}) + c.Next = slowTTLBackend(60, 200*time.Millisecond, bgDone) + + rec = dnstest.NewRecorder(&test.ResponseWriter{}) + start := time.Now() + ret, err := c.ServeDNS(ctx, rec, req.Copy()) + elapsed := time.Since(start) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ret != dns.RcodeSuccess { + t.Fatalf("expected RcodeSuccess, got %d", ret) + } + if elapsed > 150*time.Millisecond { + t.Errorf("expected response within ~timeout (50ms); took %v", elapsed) + } + if rec.Msg == nil || len(rec.Msg.Answer) == 0 { + t.Fatalf("expected an answer, got %+v", rec.Msg) + } + // Stale serve sets TTL to 0. + if got := rec.Msg.Answer[0].Header().Ttl; got != 0 { + t.Errorf("expected stale TTL=0, got %d", got) + } + + // Wait for the background verify to complete. + select { + case <-bgDone: + case <-time.After(2 * time.Second): + t.Fatalf("background verify never completed") + } +} + +func TestServeFromStaleCacheFetchVerifyTimeoutFastUpstream(t *testing.T) { + // When the upstream answers within the verify timeout, the client should + // receive the freshly verified response (not a stale one). + c := New() + c.staleUpTo = 1 * time.Hour + c.verifyStale = true + c.verifyStaleTimeout = 500 * time.Millisecond + c.Next = ttlBackend(120) + + req := new(dns.Msg) + req.SetQuestion("cached.org.", dns.TypeA) + ctx := context.TODO() + + rec := dnstest.NewRecorder(&test.ResponseWriter{}) + c.ServeDNS(ctx, rec, req) + if c.pcache.Len() != 1 { + t.Fatalf("Msg with > 0 TTL should have been cached") + } + + c.now = func() time.Time { return time.Now().Add(3 * time.Minute) } + // Fast upstream returning fresh TTL=200. + c.Next = ttlBackend(200) + + rec = dnstest.NewRecorder(&test.ResponseWriter{}) + ret, err := c.ServeDNS(ctx, rec, req.Copy()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if ret != dns.RcodeSuccess { + t.Fatalf("expected RcodeSuccess, got %d", ret) + } + if rec.Msg == nil || len(rec.Msg.Answer) == 0 { + t.Fatalf("expected an answer, got %+v", rec.Msg) + } + if got := rec.Msg.Answer[0].Header().Ttl; got != 200 { + t.Errorf("expected fresh TTL=200, got %d", got) + } +} + func TestNegativeStaleMaskingPositiveCache(t *testing.T) { c := New() c.staleUpTo = time.Minute * 10 @@ -672,6 +769,28 @@ func servFailBackend(ttl int) plugin.Handler { }) } +// slowTTLBackend wraps ttlBackend with a fixed delay to simulate a slow upstream. +// done is closed once the response is written so callers can synchronise with the +// background goroutine. +func slowTTLBackend(ttl int, delay time.Duration, done chan<- struct{}) plugin.Handler { + return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) { + select { + case <-time.After(delay): + case <-ctx.Done(): + return dns.RcodeServerFailure, ctx.Err() + } + m := new(dns.Msg) + m.SetReply(r) + m.Response, m.RecursionAvailable = true, true + m.Answer = []dns.RR{test.A(fmt.Sprintf("example.org. %d IN A 127.0.0.53", ttl))} + w.WriteMsg(m) + if done != nil { + close(done) + } + return dns.RcodeSuccess, nil + }) +} + func TestComputeTTL(t *testing.T) { tests := []struct { msgTTL time.Duration diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go index 96c65384f..5f561118d 100644 --- a/plugin/cache/handler.go +++ b/plugin/cache/handler.go @@ -45,9 +45,18 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) // serve stale behavior if c.verifyStale { crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do, cd: cd} + if c.verifyStaleTimeout > 0 { + // Background verify: cache the response but do not write to the wire. + // On timeout, we serve the stale entry below and let the goroutine continue. + crr.prefetch = true + } cw := newVerifyStaleResponseWriter(crr) - ret, err := c.doRefresh(ctx, state, cw) - if cw.refreshed { + if c.verifyStaleTimeout == 0 { + ret, err := c.doRefresh(ctx, state, cw) + if cw.refreshed { + return ret, err + } + } else if served, ret, err := c.verifyWithTimeout(ctx, state, w, cw, r, do, ad); served { return ret, err } } @@ -121,6 +130,48 @@ func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.Res return plugin.NextOrFailure(c.Name(), c.Next, ctx, cw, state.Req) } +// verifyWithTimeout runs the upstream verify in a background goroutine and races it +// against verifyStaleTimeout. If the verify completes within the timeout and the +// response is cacheable (NoError or NXDomain), the freshly cached entry is served +// to the client and served is true. Otherwise served is false and the caller falls +// through to serve stale; the goroutine continues to run and any successful response +// will update the cache without writing to the (now-detached) client connection. +func (c *Cache) verifyWithTimeout(ctx context.Context, state request.Request, w dns.ResponseWriter, cw *verifyStaleResponseWriter, r *dns.Msg, do, ad bool) (served bool, code int, err error) { + type result struct { + code int + err error + } + done := make(chan result, 1) + go func() { + rc, re := c.doRefresh(ctx, state, cw) + done <- result{rc, re} + }() + timer := time.NewTimer(c.verifyStaleTimeout) + defer timer.Stop() + select { + case res := <-done: + if !cw.refreshed { + return false, 0, nil + } + fresh := c.exists(state.Name(), state.QType(), state.Do(), state.Req.CheckingDisabled) + if fresh == nil { + // Should not happen: refreshed=true means the upstream response was cacheable. + return true, res.code, res.err + } + now := c.now().UTC() + if c.keepttl { + now = fresh.stored + } + resp := fresh.toMsg(r, now, do, ad) + if err := w.WriteMsg(resp); err != nil { + return true, dns.RcodeServerFailure, err + } + return true, dns.RcodeSuccess, nil + case <-timer.C: + return false, 0, nil + } +} + func (c *Cache) shouldPrefetch(i *item, now time.Time) bool { if c.prefetch <= 0 { return false diff --git a/plugin/cache/setup.go b/plugin/cache/setup.go index 9f18c673c..611fce1c4 100644 --- a/plugin/cache/setup.go +++ b/plugin/cache/setup.go @@ -172,7 +172,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) { case "serve_stale": args := c.RemainingArgs() - if len(args) > 2 { + if len(args) > 3 { return nil, c.ArgErr() } ca.staleUpTo = 1 * time.Hour @@ -187,6 +187,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) { ca.staleUpTo = d } ca.verifyStale = false + ca.verifyStaleTimeout = 0 if len(args) > 1 { mode := strings.ToLower(args[1]) if mode != "immediate" && mode != "verify" { @@ -194,6 +195,19 @@ func cacheParse(c *caddy.Controller) (*Cache, error) { } ca.verifyStale = mode == "verify" } + if len(args) > 2 { + if !ca.verifyStale { + return nil, errors.New("serve_stale timeout is only valid with the verify refresh mode") + } + t, err := time.ParseDuration(args[2]) + if err != nil { + return nil, fmt.Errorf("invalid serve_stale verify timeout: %w", err) + } + if t < 0 { + return nil, errors.New("invalid negative timeout for serve_stale verify") + } + ca.verifyStaleTimeout = t + } case "servfail": args := c.RemainingArgs() if len(args) != 1 { diff --git a/plugin/cache/setup_test.go b/plugin/cache/setup_test.go index 46ac5bd9e..3aaa42921 100644 --- a/plugin/cache/setup_test.go +++ b/plugin/cache/setup_test.go @@ -117,25 +117,33 @@ func TestSetup(t *testing.T) { func TestServeStale(t *testing.T) { tests := []struct { - input string - shouldErr bool - staleUpTo time.Duration - verifyStale bool + input string + shouldErr bool + staleUpTo time.Duration + verifyStale bool + verifyStaleTimeout time.Duration }{ - {"serve_stale", false, 1 * time.Hour, false}, - {"serve_stale 20m", false, 20 * time.Minute, false}, - {"serve_stale 1h20m", false, 80 * time.Minute, false}, - {"serve_stale 0m", false, 0, false}, - {"serve_stale 0", false, 0, false}, - {"serve_stale 0 verify", false, 0, true}, - {"serve_stale 0 immediate", false, 0, false}, - {"serve_stale 0 VERIFY", false, 0, true}, + {"serve_stale", false, 1 * time.Hour, false, 0}, + {"serve_stale 20m", false, 20 * time.Minute, false, 0}, + {"serve_stale 1h20m", false, 80 * time.Minute, false, 0}, + {"serve_stale 0m", false, 0, false, 0}, + {"serve_stale 0", false, 0, false, 0}, + {"serve_stale 0 verify", false, 0, true, 0}, + {"serve_stale 0 immediate", false, 0, false, 0}, + {"serve_stale 0 VERIFY", false, 0, true, 0}, + {"serve_stale 1h verify 100ms", false, 1 * time.Hour, true, 100 * time.Millisecond}, + {"serve_stale 1h verify 0", false, 1 * time.Hour, true, 0}, + {"serve_stale 1h VERIFY 250ms", false, 1 * time.Hour, true, 250 * time.Millisecond}, // fails - {"serve_stale 20", true, 0, false}, - {"serve_stale -20m", true, 0, false}, - {"serve_stale aa", true, 0, false}, - {"serve_stale 1m nono", true, 0, false}, - {"serve_stale 0 after nono", true, 0, false}, + {"serve_stale 20", true, 0, false, 0}, + {"serve_stale -20m", true, 0, false, 0}, + {"serve_stale aa", true, 0, false, 0}, + {"serve_stale 1m nono", true, 0, false, 0}, + {"serve_stale 0 after nono", true, 0, false, 0}, + {"serve_stale 1h immediate 100ms", true, 0, false, 0}, + {"serve_stale 1h verify -1ms", true, 0, false, 0}, + {"serve_stale 1h verify garbage", true, 0, false, 0}, + {"serve_stale 1h verify 100ms extra", true, 0, false, 0}, } for i, test := range tests { c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input)) @@ -153,6 +161,12 @@ func TestServeStale(t *testing.T) { if ca.staleUpTo != test.staleUpTo { t.Errorf("Test %v: Expected stale %v but found: %v", i, test.staleUpTo, ca.staleUpTo) } + if ca.verifyStale != test.verifyStale { + t.Errorf("Test %v: Expected verifyStale %v but found: %v", i, test.verifyStale, ca.verifyStale) + } + if ca.verifyStaleTimeout != test.verifyStaleTimeout { + t.Errorf("Test %v: Expected verifyStaleTimeout %v but found: %v", i, test.verifyStaleTimeout, ca.verifyStaleTimeout) + } } }