perf(proxy): use mutex-based connection pool (#7790)
* perf(proxy): use mutex-based connection pool

The proxy package (used, for example, by the forward plugin) used an actor model in which a single connManager goroutine managed connection pooling via unbuffered channels (dial, yield, ret). This design serialized all connection acquisition and release through one goroutine, creating a bottleneck under high concurrency. It was observable as a performance degradation when using a single upstream backend compared to multiple backends (which sharded the bottleneck).

Changes:
- Removed the dial, yield, and ret channels from the Transport struct.
- Removed the connManager goroutine's request-processing loop.
- Implemented Dial() and Yield() using a sync.Mutex to protect the connection slice, allowing fast concurrent access without context switching.
- Downgraded connManager to a simple background cleanup loop that only handles connection expiration on a ticker.
- Updated plugin/pkg/proxy/connect.go to use direct method calls instead of channel sends.
- Updated tests to reflect the removal of the internal channels.

Benchmarks show that this change eliminates the single-backend bottleneck: a single upstream backend now performs on par with multiple backends, and overall throughput is improved. The implementation aligns with standard Go patterns for connection pooling (e.g., net/http.Transport).

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* fix: address PR review for persistent.go

- Name the mutex field instead of embedding it, so Lock() and Unlock() are not exposed.
- Move the stop check outside of the lock in Yield().
- Close() without a separate goroutine.
- Change the stop channel to chan struct{}.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* fix: address code review feedback for conn pool

- Switch from LIFO to FIFO connection selection for source port diversity, reducing DNS cache poisoning risk (RFC 5452).
- Remove the "clear entire cache" optimization, as it was LIFO-specific; FIFO naturally iterates and skips expired connections.
- Remove all goroutines for closing connections; collect connections while holding the lock, then close them synchronously after releasing it.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* fix: remove unused error consts

No longer used after refactoring away from the channel-based approach.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* feat(forward): add max_idle_conns option

Add a configurable connection pool limit for the forward plugin via the max_idle_conns Corefile option.

Changes:
- Add SetMaxIdleConns to proxy.
- Add a maxIdleConns field to the Forward struct.
- Add max_idle_conns parsing in the forward plugin setup.
- Apply the setting to each proxy during configuration.
- Update the forward plugin README with the new option.

By default the value is 0 (unbounded). When set, excess connections returned to the pool are closed immediately rather than cached.

Also add a Yield-related test.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* chore(proxy): simplify Dial by closing conns inline

Remove the toClose slice collection in Dial and instead close expired connections directly while iterating. This reduces complexity with negligible lock-time impact.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

* chore: fewer explicit Unlock calls

Cleaner, and less chance of forgetting to unlock on new code paths.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

---------

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>
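The refactor described above replaces the channel-based actor with a mutex-guarded slice, takes connections FIFO, and closes expired ones inline while scanning. The Dial() side of that pattern is not visible in the hunks below, so here is a minimal, self-contained sketch of the idea. It is an illustration only, not the CoreDNS code: the Pool, Get, and Put names and the standalone package are invented for the example, and the real Transport additionally keeps per-transport-type buckets and dials a new connection when the pool is empty.

package pool

import (
	"net"
	"sync"
	"time"
)

// persistConn pairs a connection with the time it was last used.
type persistConn struct {
	c    net.Conn
	used time.Time
}

// Pool is a simplified, mutex-protected connection pool (a sketch, not the
// CoreDNS implementation). Connections are taken FIFO (oldest first) for
// source-port diversity, and expired ones are closed inline while scanning.
type Pool struct {
	mu           sync.Mutex
	conns        []*persistConn
	expire       time.Duration
	maxIdleConns int // 0 means unlimited
}

// Get returns a cached, non-expired connection, or nil if none is available.
func (p *Pool) Get() *persistConn {
	p.mu.Lock()
	defer p.mu.Unlock()

	i := 0
	for ; i < len(p.conns); i++ {
		pc := p.conns[i]
		if time.Since(pc.used) < p.expire {
			// FIFO: hand out the oldest still-fresh connection and drop
			// everything before it (those were expired and already closed).
			p.conns = p.conns[i+1:]
			return pc
		}
		// Expired: close inline while holding the lock; closing a dead
		// connection is cheap, so the lock-time impact is negligible.
		pc.c.Close()
	}
	p.conns = p.conns[i:]
	return nil
}

// Put returns a connection to the pool, dropping it if the pool is full.
func (p *Pool) Put(pc *persistConn) {
	pc.used = time.Now()

	p.mu.Lock()
	defer p.mu.Unlock()

	if p.maxIdleConns > 0 && len(p.conns) >= p.maxIdleConns {
		pc.c.Close()
		return
	}
	p.conns = append(p.conns, pc)
}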
@@ -3,6 +3,7 @@ package proxy
 import (
 	"crypto/tls"
 	"sort"
+	"sync"
 	"time"
 
 	"github.com/miekg/dns"
@@ -16,17 +17,16 @@ type persistConn struct {
 
 // Transport hold the persistent cache.
 type Transport struct {
-	avgDialTime int64                          // kind of average time of dial time
-	conns       [typeTotalCount][]*persistConn // Buckets for udp, tcp and tcp-tls.
-	expire      time.Duration                  // After this duration a connection is expired.
-	addr        string
-	tlsConfig   *tls.Config
-	proxyName   string
+	avgDialTime  int64                          // kind of average time of dial time
+	conns        [typeTotalCount][]*persistConn // Buckets for udp, tcp and tcp-tls.
+	expire       time.Duration                  // After this duration a connection is expired.
+	maxIdleConns int                            // Max idle connections per transport type; 0 means unlimited.
+	addr         string
+	tlsConfig    *tls.Config
+	proxyName    string
 
-	dial  chan string
-	yield chan *persistConn
-	ret   chan *persistConn
-	stop  chan bool
+	mu   sync.Mutex
+	stop chan struct{}
 }
 
 func newTransport(proxyName, addr string) *Transport {
@@ -35,10 +35,7 @@ func newTransport(proxyName, addr string) *Transport {
 		conns:     [typeTotalCount][]*persistConn{},
 		expire:    defaultExpire,
 		addr:      addr,
-		dial:      make(chan string),
-		yield:     make(chan *persistConn),
-		ret:       make(chan *persistConn),
-		stop:      make(chan bool),
+		stop:      make(chan struct{}),
 		proxyName: proxyName,
 	}
 	return t
@@ -48,38 +45,12 @@ func newTransport(proxyName, addr string) *Transport {
 func (t *Transport) connManager() {
 	ticker := time.NewTicker(defaultExpire)
 	defer ticker.Stop()
-Wait:
 	for {
 		select {
-		case proto := <-t.dial:
-			transtype := stringToTransportType(proto)
-			// take the last used conn - complexity O(1)
-			if stack := t.conns[transtype]; len(stack) > 0 {
-				pc := stack[len(stack)-1]
-				if time.Since(pc.used) < t.expire {
-					// Found one, remove from pool and return this conn.
-					t.conns[transtype] = stack[:len(stack)-1]
-					t.ret <- pc
-					continue Wait
-				}
-				// clear entire cache if the last conn is expired
-				t.conns[transtype] = nil
-				// now, the connections being passed to closeConns() are not reachable from
-				// transport methods anymore. So, it's safe to close them in a separate goroutine
-				go closeConns(stack)
-			}
-			t.ret <- nil
-
-		case pc := <-t.yield:
-			transtype := t.transportTypeFromConn(pc)
-			t.conns[transtype] = append(t.conns[transtype], pc)
-
 		case <-ticker.C:
 			t.cleanup(false)
-
 		case <-t.stop:
 			t.cleanup(true)
-			close(t.ret)
 			return
 		}
 	}
@@ -94,6 +65,9 @@ func closeConns(conns []*persistConn) {
 
 // cleanup removes connections from cache.
 func (t *Transport) cleanup(all bool) {
+	var toClose []*persistConn
+
+	t.mu.Lock()
 	staleTime := time.Now().Add(-t.expire)
 	for transtype, stack := range t.conns {
 		if len(stack) == 0 {
@@ -101,9 +75,7 @@ func (t *Transport) cleanup(all bool) {
 		}
 		if all {
 			t.conns[transtype] = nil
-			// now, the connections being passed to closeConns() are not reachable from
-			// transport methods anymore. So, it's safe to close them in a separate goroutine
-			go closeConns(stack)
+			toClose = append(toClose, stack...)
 			continue
 		}
 		if stack[0].used.After(staleTime) {
@@ -115,34 +87,38 @@ func (t *Transport) cleanup(all bool) {
 			return stack[i].used.After(staleTime)
 		})
 		t.conns[transtype] = stack[good:]
-		// now, the connections being passed to closeConns() are not reachable from
-		// transport methods anymore. So, it's safe to close them in a separate goroutine
-		go closeConns(stack[:good])
+		toClose = append(toClose, stack[:good]...)
 	}
-}
+	t.mu.Unlock()
 
-// It is hard to pin a value to this, the import thing is to no block forever, losing at cached connection is not terrible.
-const yieldTimeout = 25 * time.Millisecond
+	// Close connections after releasing lock
+	closeConns(toClose)
+}
 
 // Yield returns the connection to transport for reuse.
 func (t *Transport) Yield(pc *persistConn) {
-	pc.used = time.Now() // update used time
-
-	// Optimization: Try to return the connection immediately without creating a timer.
-	// If the receiver is not ready, we fall back to a timeout-based send to avoid blocking forever.
-	// Returning the connection is just an optimization, so dropping it on timeout is fine.
+	// Check if transport is stopped before acquiring lock
 	select {
-	case t.yield <- pc:
+	case <-t.stop:
+		// If stopped, don't return to pool, just close
+		pc.c.Close()
 		return
 	default:
 	}
 
-	select {
-	case t.yield <- pc:
-		return
-	case <-time.After(yieldTimeout):
+	pc.used = time.Now() // update used time
+
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	transtype := t.transportTypeFromConn(pc)
+
+	if t.maxIdleConns > 0 && len(t.conns[transtype]) >= t.maxIdleConns {
+		pc.c.Close()
 		return
 	}
+
+	t.conns[transtype] = append(t.conns[transtype], pc)
 }
 
 // Start starts the transport's connection manager.
@@ -154,6 +130,10 @@ func (t *Transport) Stop() { close(t.stop) }
 // SetExpire sets the connection expire time in transport.
 func (t *Transport) SetExpire(expire time.Duration) { t.expire = expire }
 
+// SetMaxIdleConns sets the maximum idle connections per transport type.
+// A value of 0 means unlimited (default).
+func (t *Transport) SetMaxIdleConns(n int) { t.maxIdleConns = n }
+
 // SetTLSConfig sets the TLS config in transport.
 func (t *Transport) SetTLSConfig(cfg *tls.Config) { t.tlsConfig = cfg }
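For configuration, the SetMaxIdleConns hook added above is driven by the forward plugin's new max_idle_conns Corefile option. A plausible Corefile snippet is sketched below; the zone, upstream address, and limit value are illustrative, and the single-integer argument form is assumed from the commit message (0, the default, means unbounded):

example.org {
    forward . 10.0.0.53 {
        max_idle_conns 16
    }
}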