Skip to content

Commit f62e19a

Browse files
authored
fix(dns): add a lower-bound negative TTL (#4449)
see linkerd/linkerd2#14954. some user reports describe situations in which, when the linkerd control plane's destination controller is OOM-killed, DNS resolution can momentarily cause the proxy to compute a negative-TTL duration of zero. this causes a panic in production environments, because `tokio::time::interval` asserts that it has not been provided a duration of zero. this manifests in errors that look like this: ``` thread 'main' panicked at linkerd/app/core/src/control.rs:118:49: period must be non-zero. ``` this commit introduces a minimum negative TTL that will be used when backing off from resolution errors. this commit moves the logic responsible for enforcing a minimum TTL (added #3807) from the outer layer in `linkerd-dns-resolve` and into the `linkerd-dns` library. this helps us reuse/consolidate the same logic for negative TTL's when recovering from resolution errors, and will facilitate making this configurable in the future, should we so desire. X-Ref: #3807 Signed-off-by: katelyn martin <kate@buoyant.io>
1 parent afb7b79 commit f62e19a

3 files changed

Lines changed: 78 additions & 46 deletions

File tree

linkerd/dns/src/lib.rs

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ use thiserror::Error;
1111
use tokio::time::{self, Instant};
1212
use tracing::{debug, trace};
1313

14+
pub mod minimum_ttl;
15+
1416
#[derive(Clone)]
1517
pub struct Resolver {
1618
dns: TokioResolver,
@@ -201,32 +203,40 @@ impl fmt::Debug for Resolver {
201203
// === impl ResolveError ===
202204

203205
impl ResolveError {
204-
/// Returns the amount of time that the resolver should wait before
205-
/// retrying.
206+
/// Returns the amount of time that the resolver should wait before retrying.
206207
pub fn negative_ttl(&self) -> Option<time::Duration> {
207-
if let Some(hickory_resolver::proto::ProtoErrorKind::NoRecordsFound {
208-
negative_ttl: Some(ttl_secs),
209-
..
210-
}) = self
211-
.a_error
212-
.0
213-
.proto()
214-
.map(hickory_resolver::proto::ProtoError::kind)
215-
{
216-
return Some(time::Duration::from_secs(*ttl_secs as u64));
208+
let Self {
209+
a_error: ARecordError(a_error),
210+
srv_error,
211+
} = self;
212+
213+
if let ttl @ Some(_) = Self::negative_ttl_of(a_error) {
214+
return ttl;
217215
}
218216

219-
if let SrvRecordError::Resolve(error) = &self.srv_error {
220-
if let Some(hickory_resolver::proto::ProtoErrorKind::NoRecordsFound {
221-
negative_ttl: Some(ttl_secs),
222-
..
223-
}) = error.proto().map(hickory_resolver::proto::ProtoError::kind)
224-
{
225-
return Some(time::Duration::from_secs(*ttl_secs as u64));
226-
}
217+
match srv_error {
218+
SrvRecordError::Resolve(srv_error) => Self::negative_ttl_of(srv_error),
219+
SrvRecordError::Invalid(_) => None,
227220
}
221+
}
222+
223+
/// Returns the negative TTL [`time::Duration`] of a [`hickory_resolver::ResolveError`].
224+
///
225+
/// This function will defensively enforce a minimum negative TTL.
226+
fn negative_ttl_of(error: &hickory_resolver::ResolveError) -> Option<time::Duration> {
227+
use hickory_resolver::proto::{ProtoError, ProtoErrorKind};
228228

229-
None
229+
let Some(ProtoErrorKind::NoRecordsFound {
230+
negative_ttl: Some(ttl_secs),
231+
..
232+
}) = error.proto().map(ProtoError::kind)
233+
else {
234+
return None;
235+
};
236+
237+
let ttl = time::Duration::from_secs(*ttl_secs as u64);
238+
let ttl = minimum_ttl::with_minimum_duration(ttl);
239+
Some(ttl)
230240
}
231241
}
232242

linkerd/dns/src/minimum_ttl.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
//! Minimum TTL enforcement.
2+
//!
3+
//! This module contains functions to enforce a lower-bound for TTL's (including negative TTL's
4+
//! used during error recovery) to prevent DNS resolution from spinning in a hot-loop.
5+
6+
use tokio::time::{Duration, Instant};
7+
use tracing::debug;
8+
9+
/// The minimum TTL duration that will be respected.
10+
const MINIMUM_TTL: Duration = Duration::from_secs(5);
11+
12+
/// Apply a lower-bound to the given [`Instant`].
13+
///
14+
/// NB: This enforces a lower-bound for TTL's to prevent DNS resolution from spinning in a
15+
/// hot-loop.
16+
#[tracing::instrument(level = "debug")]
17+
pub fn with_minimum_expiry(valid_until: Instant) -> Instant {
18+
let minimum = Instant::now() + MINIMUM_TTL;
19+
20+
// Choose a deadline; if the expiry is too short, fall back to the minimum TTL.
21+
let deadline = if valid_until >= minimum {
22+
valid_until
23+
} else {
24+
debug!(ttl.min = ?MINIMUM_TTL, "Given TTL too short, using a minimum TTL");
25+
minimum
26+
};
27+
28+
deadline
29+
}
30+
31+
/// Apply a lower-bound to the given [`Duration`].
32+
///
33+
/// NB: This enforces a lower-bound for negative TTL's to prevent DNS resolution error recovery
34+
/// from spinning in a hot-loop.
35+
pub fn with_minimum_duration(ttl: Duration) -> Duration {
36+
if ttl < MINIMUM_TTL {
37+
// Choose a deadline; if the expiry is too short, fall back to the minimum TTL.
38+
debug!(ttl.min = ?MINIMUM_TTL, ?ttl, "Given Negative TTL too short, using a minimum TTL");
39+
return MINIMUM_TTL;
40+
}
41+
42+
ttl
43+
}

linkerd/proxy/dns-resolve/src/lib.rs

Lines changed: 4 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ impl<T: Param<Addr>> tower::Service<T> for DnsResolve {
6464
}
6565

6666
async fn resolution(dns: dns::Resolver, na: NameAddr) -> Result<UpdateStream, Error> {
67+
use linkerd_dns::minimum_ttl::with_minimum_expiry;
68+
use tokio::time::sleep_until;
6769
use tokio_stream::wrappers::ReceiverStream;
6870

6971
// Don't return a stream before the initial resolution completes. Then,
@@ -80,7 +82,7 @@ async fn resolution(dns: dns::Resolver, na: NameAddr) -> Result<UpdateStream, Er
8082
trace!("Closed");
8183
return;
8284
}
83-
sleep_until_expired(expiry).await;
85+
sleep_until(with_minimum_expiry(expiry)).await;
8486

8587
loop {
8688
match dns.resolve_addrs(na.name().as_ref(), na.port()).await {
@@ -91,7 +93,7 @@ async fn resolution(dns: dns::Resolver, na: NameAddr) -> Result<UpdateStream, Er
9193
trace!("Closed");
9294
return;
9395
}
94-
sleep_until_expired(expiry).await;
96+
sleep_until(with_minimum_expiry(expiry)).await;
9597
}
9698
Err(error) => {
9799
debug!(%error);
@@ -107,26 +109,3 @@ async fn resolution(dns: dns::Resolver, na: NameAddr) -> Result<UpdateStream, Er
107109

108110
Ok(Box::pin(ReceiverStream::new(rx)))
109111
}
110-
111-
/// Sleep for the provided [`Duration`][tokio::time::Duration].
112-
///
113-
/// NB: This enforces a lower-bound for TTL's to prevent [`resolution()`], above, from spinning
114-
/// in a hot-loop.
115-
#[tracing::instrument(level = "debug")]
116-
async fn sleep_until_expired(valid_until: tokio::time::Instant) {
117-
use tokio::time::{sleep_until, Duration, Instant};
118-
119-
/// The minimum TTL duration that will be respected.
120-
const MINIMUM_TTL: Duration = Duration::from_secs(5);
121-
let minimum = Instant::now() + MINIMUM_TTL;
122-
123-
// Choose a deadline; if the expiry is too short, fall back to the minimum TTL.
124-
let deadline = if valid_until >= minimum {
125-
valid_until
126-
} else {
127-
debug!(ttl.min = ?MINIMUM_TTL, "Given TTL too short, using a minimum TTL");
128-
minimum
129-
};
130-
131-
sleep_until(deadline).await;
132-
}

0 commit comments

Comments
 (0)