From b1a7fc8db1a32c7c02bb1f584e608d3d9f7feb7a Mon Sep 17 00:00:00 2001 From: Umut Polat <52835619+umut-polat@users.noreply.github.com> Date: Wed, 20 May 2026 06:28:35 +0300 Subject: [PATCH] fix(cache): prefer positive cache over SERVFAIL in ncache (#8003) When serve_stale is enabled, a cached SERVFAIL in ncache shadows a valid positive entry in pcache because ncache is always checked first. SERVFAIL is transient and should not mask a known-good answer. When the ncache hit is a SERVFAIL, check pcache for a valid entry before returning the SERVFAIL. NXDOMAIN and NODATA are unaffected and still follow the existing ncache-first lookup per RFC 2308. Fixes #7956 Signed-off-by: umut-polat <52835619+umut-polat@users.noreply.github.com> --- plugin/cache/cache_test.go | 39 ++++++++++++++++++++++++++++++++++++++ plugin/cache/handler.go | 11 +++++++++++ 2 files changed, 50 insertions(+) diff --git a/plugin/cache/cache_test.go b/plugin/cache/cache_test.go index 22af618ec..811884495 100644 --- a/plugin/cache/cache_test.go +++ b/plugin/cache/cache_test.go @@ -1027,3 +1027,42 @@ func wildcardMetadataBackend(qname, wildcard string) plugin.Handler { return dns.RcodeSuccess, nil }) } + +func TestServfailDoesNotShadowPositiveCache(t *testing.T) { + c := New() + c.staleUpTo = time.Hour // enable serve_stale + c.failttl = 5 * time.Second + now := time.Now() + c.now = func() time.Time { return now } + + // Manually insert a positive entry in pcache (stored 30s ago, TTL 120s -> still valid). + posMsg := new(dns.Msg) + posMsg.SetQuestion("example.org.", dns.TypeA) + posMsg.Response = true + posMsg.Answer = []dns.RR{test.A("example.org. 120 IN A 127.0.0.53")} + posItem := newItem(posMsg, now.Add(-30*time.Second), 120*time.Second) + k := hash("example.org.", dns.TypeA, false, false) + c.pcache.Add(k, posItem) + + // Manually insert a SERVFAIL entry in ncache (stored just now, TTL 5s). + failMsg := new(dns.Msg) + failMsg.SetQuestion("example.org.", dns.TypeA) + failMsg.Response = true + failMsg.Rcode = dns.RcodeServerFailure + failMsg.Ns = []dns.RR{test.SOA("example.org. 5 IN SOA sns.dns.icann.org. noc.dns.icann.org. 2016082540 7200 3600 1209600 3600")} + failItem := newItem(failMsg, now, 5*time.Second) + c.ncache.Add(k, failItem) + + // Lookup should prefer the positive entry over the SERVFAIL. + req := new(dns.Msg) + req.SetQuestion("example.org.", dns.TypeA) + state := request.Request{W: &test.ResponseWriter{}, Req: req} + + got := c.getIfNotStale(now.Add(time.Second), state, "test") + if got == nil { + t.Fatal("expected a cached item, got nil") + } + if got.Rcode != dns.RcodeSuccess { + t.Fatalf("expected positive cache entry (rcode 0), got rcode %d", got.Rcode) + } +} diff --git a/plugin/cache/handler.go b/plugin/cache/handler.go index 5f561118d..cc0b20bb7 100644 --- a/plugin/cache/handler.go +++ b/plugin/cache/handler.go @@ -192,6 +192,17 @@ func (c *Cache) getIfNotStale(now time.Time, state request.Request, server strin if i, ok := c.ncache.Get(k); ok { ttl := i.ttl(now) if i.matches(state) && (ttl > 0 || (c.staleUpTo > 0 && -ttl < int(c.staleUpTo.Seconds()))) { + // SERVFAIL is transient; prefer a valid positive cache entry if one + // exists, so a cached SERVFAIL does not shadow a previously good answer. + if i.Rcode == dns.RcodeServerFailure { + if p, pok := c.pcache.Get(k); pok { + pttl := p.ttl(now) + if p.matches(state) && (pttl > 0 || (c.staleUpTo > 0 && -pttl < int(c.staleUpTo.Seconds()))) { + cacheHits.WithLabelValues(server, Success, c.zonesMetricLabel, c.viewMetricLabel).Inc() + return p + } + } + } cacheHits.WithLabelValues(server, Denial, c.zonesMetricLabel, c.viewMetricLabel).Inc() return i }