feat(cache): add optional verify timeout to serve_stale (#8070)

This commit is contained in:
Syed Azeez
2026-05-06 13:02:28 +05:30
committed by GitHub
parent 145029c847
commit b2cb44b966
6 changed files with 226 additions and 23 deletions

View File

@@ -40,7 +40,7 @@ cache [TTL] [ZONES...] {
success CAPACITY [TTL] [MINTTL]
denial CAPACITY [TTL] [MINTTL]
prefetch AMOUNT [[DURATION] [PERCENTAGE%]]
serve_stale [DURATION] [REFRESH_MODE]
serve_stale [DURATION] [REFRESH_MODE [VERIFY_TIMEOUT]]
servfail DURATION
disable success|denial [ZONES...]
keepttl
@@ -74,6 +74,10 @@ cache [TTL] [ZONES...] {
from ever being served if an updated response can be retrieved from the source.
In `immediate` mode, concurrent requests for the same expired entry dispatch at most one
background refresh.
**VERIFY_TIMEOUT** is only valid with `verify` and bounds how long the cache waits for the upstream
verify before falling back to the stale entry. The verify continues in the background and refreshes the
cache if it eventually succeeds, so subsequent queries see the fresh entry. The default of `0` means
wait until the upstream's own timeout (the original `verify` behavior). Example: `serve_stale 1h verify 100ms`.
* `servfail` cache SERVFAIL responses for **DURATION**. Setting **DURATION** to 0 will disable caching of SERVFAIL
responses. If this option is not set, SERVFAIL responses will be cached for 5 seconds. **DURATION** may not be
greater than 5 minutes.

View File

@@ -42,8 +42,9 @@ type Cache struct {
percentage int
// Stale serve
staleUpTo time.Duration
verifyStale bool
staleUpTo time.Duration
verifyStale bool
verifyStaleTimeout time.Duration // 0 means wait for upstream until its own timeout (current default).
// Positive/negative zone exceptions
pexcept []string

View File

@@ -501,6 +501,103 @@ func TestServeFromStaleCacheFetchVerify(t *testing.T) {
}
}
func TestServeFromStaleCacheFetchVerifyTimeout(t *testing.T) {
	// When verifyStaleTimeout is set and the upstream is slower than that
	// bound, the client must receive the stale entry within roughly the
	// timeout, while the pending verify keeps running in the background and
	// eventually refreshes the cache.
	c := New()
	c.staleUpTo = time.Hour
	c.verifyStale = true
	c.verifyStaleTimeout = 50 * time.Millisecond
	c.Next = ttlBackend(120)

	m := new(dns.Msg)
	m.SetQuestion("cached.org.", dns.TypeA)
	ctx := context.TODO()

	// Warm the cache with an entry carrying a 120s TTL.
	w := dnstest.NewRecorder(&test.ResponseWriter{})
	c.ServeDNS(ctx, w, m)
	if c.pcache.Len() != 1 {
		t.Fatalf("Msg with > 0 TTL should have been cached")
	}

	// Advance the clock beyond the TTL so the cached entry becomes stale.
	c.now = func() time.Time { return time.Now().Add(3 * time.Minute) }

	// Install a backend that responds slower than the verify timeout; the
	// channel lets us observe the background verify finishing.
	verified := make(chan struct{})
	c.Next = slowTTLBackend(60, 200*time.Millisecond, verified)

	w = dnstest.NewRecorder(&test.ResponseWriter{})
	began := time.Now()
	code, err := c.ServeDNS(ctx, w, m.Copy())
	took := time.Since(began)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if code != dns.RcodeSuccess {
		t.Fatalf("expected RcodeSuccess, got %d", code)
	}
	if took > 150*time.Millisecond {
		t.Errorf("expected response within ~timeout (50ms); took %v", took)
	}
	if w.Msg == nil || len(w.Msg.Answer) == 0 {
		t.Fatalf("expected an answer, got %+v", w.Msg)
	}
	// A stale answer is always served with TTL 0.
	if ttl := w.Msg.Answer[0].Header().Ttl; ttl != 0 {
		t.Errorf("expected stale TTL=0, got %d", ttl)
	}

	// The detached verify must still complete and signal via the channel.
	select {
	case <-verified:
	case <-time.After(2 * time.Second):
		t.Fatalf("background verify never completed")
	}
}
func TestServeFromStaleCacheFetchVerifyTimeoutFastUpstream(t *testing.T) {
	// An upstream that answers inside the verify timeout must produce the
	// freshly verified response for the client, never the stale copy.
	c := New()
	c.staleUpTo = time.Hour
	c.verifyStale = true
	c.verifyStaleTimeout = 500 * time.Millisecond
	c.Next = ttlBackend(120)

	m := new(dns.Msg)
	m.SetQuestion("cached.org.", dns.TypeA)
	ctx := context.TODO()

	// Warm the cache.
	w := dnstest.NewRecorder(&test.ResponseWriter{})
	c.ServeDNS(ctx, w, m)
	if c.pcache.Len() != 1 {
		t.Fatalf("Msg with > 0 TTL should have been cached")
	}

	// Let the entry go stale, then swap in a fast upstream whose fresh
	// answer carries TTL=200.
	c.now = func() time.Time { return time.Now().Add(3 * time.Minute) }
	c.Next = ttlBackend(200)

	w = dnstest.NewRecorder(&test.ResponseWriter{})
	code, err := c.ServeDNS(ctx, w, m.Copy())
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if code != dns.RcodeSuccess {
		t.Fatalf("expected RcodeSuccess, got %d", code)
	}
	if w.Msg == nil || len(w.Msg.Answer) == 0 {
		t.Fatalf("expected an answer, got %+v", w.Msg)
	}
	if ttl := w.Msg.Answer[0].Header().Ttl; ttl != 200 {
		t.Errorf("expected fresh TTL=200, got %d", ttl)
	}
}
func TestNegativeStaleMaskingPositiveCache(t *testing.T) {
c := New()
c.staleUpTo = time.Minute * 10
@@ -672,6 +769,28 @@ func servFailBackend(ttl int) plugin.Handler {
})
}
// slowTTLBackend behaves like ttlBackend but delays its answer by the given
// duration, simulating a slow upstream. When done is non-nil it is closed
// once the response has been written, letting callers synchronise with the
// background goroutine.
func slowTTLBackend(ttl int, delay time.Duration, done chan<- struct{}) plugin.Handler {
	return plugin.HandlerFunc(func(ctx context.Context, w dns.ResponseWriter, r *dns.Msg) (int, error) {
		// Honour context cancellation while waiting out the artificial delay.
		timer := time.NewTimer(delay)
		defer timer.Stop()
		select {
		case <-ctx.Done():
			return dns.RcodeServerFailure, ctx.Err()
		case <-timer.C:
		}
		resp := new(dns.Msg)
		resp.SetReply(r)
		resp.Response, resp.RecursionAvailable = true, true
		resp.Answer = []dns.RR{test.A(fmt.Sprintf("example.org. %d IN A 127.0.0.53", ttl))}
		w.WriteMsg(resp)
		if done != nil {
			close(done)
		}
		return dns.RcodeSuccess, nil
	})
}
func TestComputeTTL(t *testing.T) {
tests := []struct {
msgTTL time.Duration

View File

@@ -45,9 +45,18 @@ func (c *Cache) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg)
// serve stale behavior
if c.verifyStale {
crr := &ResponseWriter{ResponseWriter: w, Cache: c, state: state, server: server, do: do, cd: cd}
if c.verifyStaleTimeout > 0 {
// Background verify: cache the response but do not write to the wire.
// On timeout, we serve the stale entry below and let the goroutine continue.
crr.prefetch = true
}
cw := newVerifyStaleResponseWriter(crr)
ret, err := c.doRefresh(ctx, state, cw)
if cw.refreshed {
if c.verifyStaleTimeout == 0 {
ret, err := c.doRefresh(ctx, state, cw)
if cw.refreshed {
return ret, err
}
} else if served, ret, err := c.verifyWithTimeout(ctx, state, w, cw, r, do, ad); served {
return ret, err
}
}
@@ -121,6 +130,48 @@ func (c *Cache) doRefresh(ctx context.Context, state request.Request, cw dns.Res
return plugin.NextOrFailure(c.Name(), c.Next, ctx, cw, state.Req)
}
// verifyWithTimeout runs the upstream verify in a background goroutine and races it
// against verifyStaleTimeout. If the verify completes within the timeout and the
// response is cacheable (NoError or NXDomain), the freshly cached entry is served
// to the client and served is true. Otherwise served is false and the caller falls
// through to serve stale; the goroutine continues to run and any successful response
// will update the cache without writing to the (now-detached) client connection.
func (c *Cache) verifyWithTimeout(ctx context.Context, state request.Request, w dns.ResponseWriter, cw *verifyStaleResponseWriter, r *dns.Msg, do, ad bool) (served bool, code int, err error) {
	type result struct {
		code int
		err  error
	}
	// Buffer of 1 so the goroutine's send always succeeds even when nobody
	// is listening anymore (timeout path) — prevents a goroutine leak.
	done := make(chan result, 1)
	go func() {
		rc, re := c.doRefresh(ctx, state, cw)
		done <- result{rc, re}
	}()
	timer := time.NewTimer(c.verifyStaleTimeout)
	defer timer.Stop()
	select {
	case res := <-done:
		if !cw.refreshed {
			// Upstream answered in time but the response was not cacheable
			// (e.g. SERVFAIL); let the caller fall through to serve stale.
			return false, 0, nil
		}
		// The refresh stored a fresh entry; read it back and serve that.
		fresh := c.exists(state.Name(), state.QType(), state.Do(), state.Req.CheckingDisabled)
		if fresh == nil {
			// Should not happen: refreshed=true means the upstream response was cacheable.
			return true, res.code, res.err
		}
		now := c.now().UTC()
		if c.keepttl {
			// keepttl serves entries with their original TTL, so compute
			// TTLs relative to the time the entry was stored.
			now = fresh.stored
		}
		resp := fresh.toMsg(r, now, do, ad)
		if err := w.WriteMsg(resp); err != nil {
			return true, dns.RcodeServerFailure, err
		}
		return true, dns.RcodeSuccess, nil
	case <-timer.C:
		// Timeout: caller serves the stale entry; the in-flight verify keeps
		// running and refreshes the cache when/if it completes.
		return false, 0, nil
	}
}
func (c *Cache) shouldPrefetch(i *item, now time.Time) bool {
if c.prefetch <= 0 {
return false

16
plugin/cache/setup.go vendored
View File

@@ -172,7 +172,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
case "serve_stale":
args := c.RemainingArgs()
if len(args) > 2 {
if len(args) > 3 {
return nil, c.ArgErr()
}
ca.staleUpTo = 1 * time.Hour
@@ -187,6 +187,7 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
ca.staleUpTo = d
}
ca.verifyStale = false
ca.verifyStaleTimeout = 0
if len(args) > 1 {
mode := strings.ToLower(args[1])
if mode != "immediate" && mode != "verify" {
@@ -194,6 +195,19 @@ func cacheParse(c *caddy.Controller) (*Cache, error) {
}
ca.verifyStale = mode == "verify"
}
if len(args) > 2 {
if !ca.verifyStale {
return nil, errors.New("serve_stale timeout is only valid with the verify refresh mode")
}
t, err := time.ParseDuration(args[2])
if err != nil {
return nil, fmt.Errorf("invalid serve_stale verify timeout: %w", err)
}
if t < 0 {
return nil, errors.New("invalid negative timeout for serve_stale verify")
}
ca.verifyStaleTimeout = t
}
case "servfail":
args := c.RemainingArgs()
if len(args) != 1 {

View File

@@ -117,25 +117,33 @@ func TestSetup(t *testing.T) {
func TestServeStale(t *testing.T) {
tests := []struct {
input string
shouldErr bool
staleUpTo time.Duration
verifyStale bool
input string
shouldErr bool
staleUpTo time.Duration
verifyStale bool
verifyStaleTimeout time.Duration
}{
{"serve_stale", false, 1 * time.Hour, false},
{"serve_stale 20m", false, 20 * time.Minute, false},
{"serve_stale 1h20m", false, 80 * time.Minute, false},
{"serve_stale 0m", false, 0, false},
{"serve_stale 0", false, 0, false},
{"serve_stale 0 verify", false, 0, true},
{"serve_stale 0 immediate", false, 0, false},
{"serve_stale 0 VERIFY", false, 0, true},
{"serve_stale", false, 1 * time.Hour, false, 0},
{"serve_stale 20m", false, 20 * time.Minute, false, 0},
{"serve_stale 1h20m", false, 80 * time.Minute, false, 0},
{"serve_stale 0m", false, 0, false, 0},
{"serve_stale 0", false, 0, false, 0},
{"serve_stale 0 verify", false, 0, true, 0},
{"serve_stale 0 immediate", false, 0, false, 0},
{"serve_stale 0 VERIFY", false, 0, true, 0},
{"serve_stale 1h verify 100ms", false, 1 * time.Hour, true, 100 * time.Millisecond},
{"serve_stale 1h verify 0", false, 1 * time.Hour, true, 0},
{"serve_stale 1h VERIFY 250ms", false, 1 * time.Hour, true, 250 * time.Millisecond},
// fails
{"serve_stale 20", true, 0, false},
{"serve_stale -20m", true, 0, false},
{"serve_stale aa", true, 0, false},
{"serve_stale 1m nono", true, 0, false},
{"serve_stale 0 after nono", true, 0, false},
{"serve_stale 20", true, 0, false, 0},
{"serve_stale -20m", true, 0, false, 0},
{"serve_stale aa", true, 0, false, 0},
{"serve_stale 1m nono", true, 0, false, 0},
{"serve_stale 0 after nono", true, 0, false, 0},
{"serve_stale 1h immediate 100ms", true, 0, false, 0},
{"serve_stale 1h verify -1ms", true, 0, false, 0},
{"serve_stale 1h verify garbage", true, 0, false, 0},
{"serve_stale 1h verify 100ms extra", true, 0, false, 0},
}
for i, test := range tests {
c := caddy.NewTestController("dns", fmt.Sprintf("cache {\n%s\n}", test.input))
@@ -153,6 +161,12 @@ func TestServeStale(t *testing.T) {
if ca.staleUpTo != test.staleUpTo {
t.Errorf("Test %v: Expected stale %v but found: %v", i, test.staleUpTo, ca.staleUpTo)
}
if ca.verifyStale != test.verifyStale {
t.Errorf("Test %v: Expected verifyStale %v but found: %v", i, test.verifyStale, ca.verifyStale)
}
if ca.verifyStaleTimeout != test.verifyStaleTimeout {
t.Errorf("Test %v: Expected verifyStaleTimeout %v but found: %v", i, test.verifyStaleTimeout, ca.verifyStaleTimeout)
}
}
}