feat(cache): add optional verify timeout to serve_stale (#8070)

This commit is contained in:
Syed Azeez
2026-05-06 13:02:28 +05:30
committed by GitHub
parent 145029c847
commit b2cb44b966
6 changed files with 226 additions and 23 deletions

View File

@@ -40,7 +40,7 @@ cache [TTL] [ZONES...] {
success CAPACITY [TTL] [MINTTL] success CAPACITY [TTL] [MINTTL]
denial CAPACITY [TTL] [MINTTL] denial CAPACITY [TTL] [MINTTL]
prefetch AMOUNT [[DURATION] [PERCENTAGE%]] prefetch AMOUNT [[DURATION] [PERCENTAGE%]]
serve_stale [DURATION] [REFRESH_MODE] serve_stale [DURATION] [REFRESH_MODE [VERIFY_TIMEOUT]]
servfail DURATION servfail DURATION
disable success|denial [ZONES...] disable success|denial [ZONES...]
keepttl keepttl
@@ -74,6 +74,10 @@ cache [TTL] [ZONES...] {
from ever being served if an updated response can be retrieved from the source. from ever being served if an updated response can be retrieved from the source.
In `immediate` mode, concurrent requests for the same expired entry dispatch at most one In `immediate` mode, concurrent requests for the same expired entry dispatch at most one
background refresh. background refresh.
**VERIFY_TIMEOUT** is only valid with `verify` and bounds how long the cache waits for the upstream
verify before falling back to the stale entry. The verify continues in the background and refreshes the
cache when it eventually succeeds, so subsequent queries see the fresh entry. The default of `0` means
wait until the upstream's own timeout (the original `verify` behavior). Example: `serve_stale 1h verify 100ms`.
* `servfail` cache SERVFAIL responses for **DURATION**. Setting **DURATION** to 0 will disable caching of SERVFAIL * `servfail` cache SERVFAIL responses for **DURATION**. Setting **DURATION** to 0 will disable caching of SERVFAIL
responses. If this option is not set, SERVFAIL responses will be cached for 5 seconds. **DURATION** may not be responses. If this option is not set, SERVFAIL responses will be cached for 5 seconds. **DURATION** may not be
greater than 5 minutes. greater than 5 minutes.

View File

@@ -42,8 +42,9 @@ type Cache struct {
percentage int percentage int
// Stale serve // Stale serve
staleUpTo time.Duration staleUpTo time.Duration
verifyStale bool verifyStale bool
verifyStaleTimeout time.Duration // 0 means wait for upstream until its own timeout (current default).
// Positive/negative zone exceptions // Positive/negative zone exceptions
pexcept []string pexcept []string

View File

@@ -501,6 +501,103 @@ func TestServeFromStaleCacheFetchVerify(t *testing.T) {
} }
} }
func TestServeFromStaleCacheFetchVerifyTimeout(t *testing.T) {
// Verify that when verifyStaleTimeout is set and the upstream is slow,
// the client gets the stale entry within ~timeout, while the in-flight
// verify continues in the background and refreshes the cache.
c := New()
c.staleUpTo = 1 * time.Hour
c.verifyStale = true
c.verifyStaleTimeout = 50 * time.Millisecond
c.Next = ttlBackend(120)
req := new(dns.Msg)
req.SetQuestion("cached.org.", dns.TypeA)
ctx := context.TODO()
// Prime the cache with a 120s TTL entry.
rec := dnstest.NewRecorder(&test.ResponseWriter{})
c.ServeDNS(ctx, rec, req)
if c.pcache.Len() != 1 {
t.Fatalf("Msg with > 0 TTL should have been cached")
}
// Move forward past the TTL so the entry is stale.
c.now = func() time.Time { return time.Now().Add(3 * time.Minute) }
// Swap in a slow backend that takes longer than the verify timeout.
bgDone := make(chan struct{})
c.Next = slowTTLBackend(60, 200*time.Millisecond, bgDone)
rec = dnstest.NewRecorder(&test.ResponseWriter{})
start := time.Now()
ret, err := c.ServeDNS(ctx, rec, req.Copy())
elapsed := time.Since(start)
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if ret != dns.RcodeSuccess {
t.Fatalf("expected RcodeSuccess, got %d", ret)
}
if elapsed > 150*time.Millisecond {
t.Errorf("expected response within ~timeout (50ms); took %v", elapsed)
}
if rec.Msg == nil || len(rec.Msg.Answer) == 0 {
t.Fatalf("expected an answer, got %+v", rec.Msg)
}
// Stale serve sets TTL to 0.
if got := rec.Msg.Answer[0].Header().Ttl; got != 0 {
t.Errorf("expected stale TTL=0, got %d", got)
}
// Wait for the background verify to complete.
select {
case <-bgDone:
case <-time.After(2 * time.Second):
t.Fatalf("background verify never completed")
}
}
func TestServeFromStaleCacheFetchVerifyTimeoutFastUpstream(t *testing.T) {
// When the upstream answers within the verify timeout, the client should
// receive the freshly verified response (not a stale one).
c := New()
c.staleUpTo = 1 * time.Hour
c.verifyStale = true
c.verifyStaleTimeout = 500 * time.Millisecond
c.Next = ttlBackend(120)
req := new(dns.Msg)
req.SetQuestion("cached.org.", dns.TypeA)
ctx := context.TODO()
rec := dnstest.NewRecorder(&test.ResponseWriter{})
c.ServeDNS(ctx, rec, req)
if c.pcache.Len() != 1 {
t.Fatalf("Msg with > 0 TTL should have been cached")
}
c.now = func() time.Time { return time.Now().Add(3 * time.Minute) }
// Fast upstream returning fresh TTL=200.
c.Next = ttlBackend(200)
rec = dnstest.NewRecorder(&test.ResponseWriter{})
ret, err := c.ServeDNS(ctx, rec, req.Copy())
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if ret != dns.RcodeSuccess {
t.Fatalf("expected RcodeSuccess, got %d", ret)
}
if rec.Msg == nil || len(rec.Msg.Answer) == 0 {
t.Fatalf("expected an answer, got %+v", rec.Msg)
}
if got := rec.Msg.Answer[0].Header().Ttl; got != 200 {
t.Errorf("expected fresh TTL=200, got %d", got)
}
}
func TestNegativeStaleMaskingPositiveCache(t *testing.T) { func TestNegativeStaleMaskingPositiveCache(t *testing.T) {
c := New() c := New()
c.staleUpTo = time.Minute * 10 c.staleUpTo = time.Minute * 10
@@ -672,6 +769,28 @@ func servFailBackend(ttl int) plugin.Handler {
}) })
} }
// slowTTLBackend wraps ttlBackend with a fixed delay to simulate a slow upstream.
// done is closed once the response is written so callers can synchronise with the
// background goroutine.
func slowTTLBackend(ttl int, delay time.Duration, done chan<- struct{}) plugin.Handler {
return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
select {
case <-time.After(delay):
case <-ctx.Done():
return dns.RcodeServerFailure, ctx.Err()
}
m := new(dns.Msg)
m.SetReply(r)
m.Response, m.RecursionAvailable = true, true
m.Answer = []dns.RR{test.A(fmt.Sprintf("example.org. %d IN A 127.0.0.53", ttl))}
w.WriteMsg(m)
if done != nil {
close(done)
}
return dns.RcodeSuccess, nil
})
}
func TestComputeTTL(t *testing.T) { func TestComputeTTL(t *testing.T) {
tests := []struct { tests := []struct {
msgTTL time.Duration msgTTL time.Duration

View File

@@ -45,9 +45,18 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
// serve stale behavior // serve stale behavior
if c.verifyStale { if c.verifyStale {
crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do, cd: cd} crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do, cd: cd}
if c.verifyStaleTimeout > 0 {
// Background verify: cache the response but do not write to the wire.
// On timeout, we serve the stale entry below and let the goroutine continue.
crr.prefetch = true
}
cw := newVerifyStaleResponseWriter(crr) cw := newVerifyStaleResponseWriter(crr)
ret, err := c.doRefresh(ctx, state, cw) if c.verifyStaleTimeout == 0 {
if cw.refreshed { ret, err := c.doRefresh(ctx, state, cw)
if cw.refreshed {
return ret, err
}
} else if served, ret, err := c.verifyWithTimeout(ctx, state, w, cw, r, do, ad); served {
return ret, err return ret, err
} }
} }
@@ -121,6 +130,48 @@ func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.Res
return plugin.NextOrFailure(c.Name(), c.Next, ctx, cw, state.Req) return plugin.NextOrFailure(c.Name(), c.Next, ctx, cw, state.Req)
} }
// verifyWithTimeout runs the upstream verify in a background goroutine and races it
// against verifyStaleTimeout. If the verify completes within the timeout and the
// response is cacheable (NoError or NXDomain), the freshly cached entry is served
// to the client and served is true. Otherwise served is false and the caller falls
// through to serve stale; the goroutine continues to run and any successful response
// will update the cache without writing to the (now-detached) client connection.
func (c *Cache) verifyWithTimeout(ctx context.Context, state request.Request, w dns.ResponseWriter, cw *verifyStaleResponseWriter, r *dns.Msg, do, ad bool) (served bool, code int, err error) {
type result struct {
code int
err error
}
done := make(chan result, 1)
go func() {
rc, re := c.doRefresh(ctx, state, cw)
done <- result{rc, re}
}()
timer := time.NewTimer(c.verifyStaleTimeout)
defer timer.Stop()
select {
case res := <-done:
if !cw.refreshed {
return false, 0, nil
}
fresh := c.exists(state.Name(), state.QType(), state.Do(), state.Req.CheckingDisabled)
if fresh == nil {
// Should not happen: refreshed=true means the upstream response was cacheable.
return true, res.code, res.err
}
now := c.now().UTC()
if c.keepttl {
now = fresh.stored
}
resp := fresh.toMsg(r, now, do, ad)
if err := w.WriteMsg(resp); err != nil {
return true, dns.RcodeServerFailure, err
}
return true, dns.RcodeSuccess, nil
case <-timer.C:
return false, 0, nil
}
}
func (c *Cache) shouldPrefetch(i *item, now time.Time) bool { func (c *Cache) shouldPrefetch(i *item, now time.Time) bool {
if c.prefetch <= 0 { if c.prefetch <= 0 {
return false return false

16
plugin/cache/setup.go vendored
View File

@@ -172,7 +172,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
case "serve_stale": case "serve_stale":
args := c.RemainingArgs() args := c.RemainingArgs()
if len(args) > 2 { if len(args) > 3 {
return nil, c.ArgErr() return nil, c.ArgErr()
} }
ca.staleUpTo = 1 * time.Hour ca.staleUpTo = 1 * time.Hour
@@ -187,6 +187,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
ca.staleUpTo = d ca.staleUpTo = d
} }
ca.verifyStale = false ca.verifyStale = false
ca.verifyStaleTimeout = 0
if len(args) > 1 { if len(args) > 1 {
mode := strings.ToLower(args[1]) mode := strings.ToLower(args[1])
if mode != "immediate" && mode != "verify" { if mode != "immediate" && mode != "verify" {
@@ -194,6 +195,19 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
} }
ca.verifyStale = mode == "verify" ca.verifyStale = mode == "verify"
} }
if len(args) > 2 {
if !ca.verifyStale {
return nil, errors.New("serve_stale timeout is only valid with the verify refresh mode")
}
t, err := time.ParseDuration(args[2])
if err != nil {
return nil, fmt.Errorf("invalid serve_stale verify timeout: %w", err)
}
if t < 0 {
return nil, errors.New("invalid negative timeout for serve_stale verify")
}
ca.verifyStaleTimeout = t
}
case "servfail": case "servfail":
args := c.RemainingArgs() args := c.RemainingArgs()
if len(args) != 1 { if len(args) != 1 {

View File

@@ -117,25 +117,33 @@ func TestSetup(t *testing.T) {
func TestServeStale(t *testing.T) { func TestServeStale(t *testing.T) {
tests := []struct { tests := []struct {
input string input string
shouldErr bool shouldErr bool
staleUpTo time.Duration staleUpTo time.Duration
verifyStale bool verifyStale bool
verifyStaleTimeout time.Duration
}{ }{
{"serve_stale", false, 1 * time.Hour, false}, {"serve_stale", false, 1 * time.Hour, false, 0},
{"serve_stale 20m", false, 20 * time.Minute, false}, {"serve_stale 20m", false, 20 * time.Minute, false, 0},
{"serve_stale 1h20m", false, 80 * time.Minute, false}, {"serve_stale 1h20m", false, 80 * time.Minute, false, 0},
{"serve_stale 0m", false, 0, false}, {"serve_stale 0m", false, 0, false, 0},
{"serve_stale 0", false, 0, false}, {"serve_stale 0", false, 0, false, 0},
{"serve_stale 0 verify", false, 0, true}, {"serve_stale 0 verify", false, 0, true, 0},
{"serve_stale 0 immediate", false, 0, false}, {"serve_stale 0 immediate", false, 0, false, 0},
{"serve_stale 0 VERIFY", false, 0, true}, {"serve_stale 0 VERIFY", false, 0, true, 0},
{"serve_stale 1h verify 100ms", false, 1 * time.Hour, true, 100 * time.Millisecond},
{"serve_stale 1h verify 0", false, 1 * time.Hour, true, 0},
{"serve_stale 1h VERIFY 250ms", false, 1 * time.Hour, true, 250 * time.Millisecond},
// fails // fails
{"serve_stale 20", true, 0, false}, {"serve_stale 20", true, 0, false, 0},
{"serve_stale -20m", true, 0, false}, {"serve_stale -20m", true, 0, false, 0},
{"serve_stale aa", true, 0, false}, {"serve_stale aa", true, 0, false, 0},
{"serve_stale 1m nono", true, 0, false}, {"serve_stale 1m nono", true, 0, false, 0},
{"serve_stale 0 after nono", true, 0, false}, {"serve_stale 0 after nono", true, 0, false, 0},
{"serve_stale 1h immediate 100ms", true, 0, false, 0},
{"serve_stale 1h verify -1ms", true, 0, false, 0},
{"serve_stale 1h verify garbage", true, 0, false, 0},
{"serve_stale 1h verify 100ms extra", true, 0, false, 0},
} }
for i, test := range tests { for i, test := range tests {
c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input)) c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))
@@ -153,6 +161,12 @@ func TestServeStale(t *testing.T) {
if ca.staleUpTo != test.staleUpTo { if ca.staleUpTo != test.staleUpTo {
t.Errorf("Test %v: Expected stale %v but found: %v", i, test.staleUpTo, ca.staleUpTo) t.Errorf("Test %v: Expected stale %v but found: %v", i, test.staleUpTo, ca.staleUpTo)
} }
if ca.verifyStale != test.verifyStale {
t.Errorf("Test %v: Expected verifyStale %v but found: %v", i, test.verifyStale, ca.verifyStale)
}
if ca.verifyStaleTimeout != test.verifyStaleTimeout {
t.Errorf("Test %v: Expected verifyStaleTimeout %v but found: %v", i, test.verifyStaleTimeout, ca.verifyStaleTimeout)
}
} }
} }