Skip to content

Commit ccc2a2b

Browse files
authored
Merge pull request #87 from docker/cloud-ttl-8hour
Temporarily bump GPU-enabled cloud idle timeout to 8 hours.
2 parents 24a2a4b + e0ddfdf commit ccc2a2b

File tree

1 file changed

+38
-21
lines changed

1 file changed

+38
-21
lines changed

pkg/inference/scheduling/loader.go

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ const (
2121
// being it is almost certainly greater than the number of models that most
2222
// developers' systems will be able to load.
2323
maximumRunnerSlots = 8
24-
// runnerIdleTimeout is the maximum amount of time that a runner can sit
25-
// idle (i.e. without any requests) before being terminated.
26-
runnerIdleTimeout = 5 * time.Minute
24+
// defaultRunnerIdleTimeout is the default maximum amount of time that a
25+
// runner can sit idle (i.e. without any requests) before being terminated.
26+
defaultRunnerIdleTimeout = 5 * time.Minute
2727
)
2828

2929
var (
@@ -59,6 +59,8 @@ type loader struct {
5959
backends map[string]inference.Backend
6060
// modelManager is the shared model manager.
6161
modelManager *models.Manager
62+
// runnerIdleTimeout is the idle timeout that this loader applies to its
// runners; it is fixed when the loader is created.
63+
runnerIdleTimeout time.Duration
6264
// totalMemory is the total system memory allocated to the loader.
6365
totalMemory uint64
6466
// idleCheck is used to signal the run loop when timestamps have updated.
@@ -104,6 +106,19 @@ func newLoader(
104106
// tune this heuristic for systems with enormous amounts of VRAM.
105107
nSlots := min(runtime.NumCPU(), maximumRunnerSlots)
106108

109+
// Check whether we're running in a GPU-enabled cloud environment (the cloud
// environment with NVIDIA_VISIBLE_DEVICES set).
110+
isGPUEnabledCloudEnvironment := environment.Get() == environment.EnvironmentCloud &&
111+
os.Getenv("NVIDIA_VISIBLE_DEVICES") != ""
112+
113+
// Compute the idle runner timeout.
114+
//
115+
// HACK: On GPU-enabled cloud engines, we'll bump this to 8 hours. We can
116+
// remove this once we have configurable TTLs.
117+
runnerIdleTimeout := defaultRunnerIdleTimeout
118+
if isGPUEnabledCloudEnvironment {
119+
runnerIdleTimeout = 8 * time.Hour
120+
}
121+
107122
// Compute the amount of available memory.
108123
//
109124
// TODO: For now, we treat the system as having memory size 1 and all models
@@ -114,28 +129,30 @@ func newLoader(
114129
// computing model size through estimation (using parameter count and
115130
// quantization data type size).
116131
//
117-
// HACK: On GPU-enabled cloud engines, we'll temporarily bump this to 2.
132+
// HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
133+
// this once we have VRAM estimation.
118134
totalMemory := uint64(1)
119-
if environment.Get() == environment.EnvironmentCloud && os.Getenv("NVIDIA_VISIBLE_DEVICES") != "" {
135+
if isGPUEnabledCloudEnvironment {
120136
totalMemory = 2
121137
}
122138

123139
// Create the loader.
124140
l := &loader{
125-
log: log,
126-
backends: backends,
127-
modelManager: modelManager,
128-
totalMemory: totalMemory,
129-
idleCheck: make(chan struct{}, 1),
130-
guard: make(chan struct{}, 1),
131-
availableMemory: totalMemory,
132-
waiters: make(map[chan<- struct{}]bool),
133-
runners: make(map[runnerKey]int, nSlots),
134-
slots: make([]*runner, nSlots),
135-
references: make([]uint, nSlots),
136-
allocations: make([]uint64, nSlots),
137-
timestamps: make([]time.Time, nSlots),
138-
runnerConfigs: make(map[runnerKey]inference.BackendConfiguration),
141+
log: log,
142+
backends: backends,
143+
modelManager: modelManager,
144+
runnerIdleTimeout: runnerIdleTimeout,
145+
totalMemory: totalMemory,
146+
idleCheck: make(chan struct{}, 1),
147+
guard: make(chan struct{}, 1),
148+
availableMemory: totalMemory,
149+
waiters: make(map[chan<- struct{}]bool),
150+
runners: make(map[runnerKey]int, nSlots),
151+
slots: make([]*runner, nSlots),
152+
references: make([]uint, nSlots),
153+
allocations: make([]uint64, nSlots),
154+
timestamps: make([]time.Time, nSlots),
155+
runnerConfigs: make(map[runnerKey]inference.BackendConfiguration),
139156
}
140157
l.guard <- struct{}{}
141158
return l
@@ -176,7 +193,7 @@ func (l *loader) evict(idleOnly bool) int {
176193
now := time.Now()
177194
for r, slot := range l.runners {
178195
unused := l.references[slot] == 0
179-
idle := unused && now.Sub(l.timestamps[slot]) > runnerIdleTimeout
196+
idle := unused && now.Sub(l.timestamps[slot]) > l.runnerIdleTimeout
180197
defunct := false
181198
select {
182199
case <-l.slots[slot].done:
@@ -283,7 +300,7 @@ func (l *loader) idleCheckDuration() time.Duration {
283300
// Compute the remaining duration. If negative, check immediately, otherwise
284301
// wait until 100 milliseconds after expiration time (to avoid checking
285302
// right on the expiration boundary).
286-
if remaining := runnerIdleTimeout - time.Since(oldest); remaining < 0 {
303+
if remaining := l.runnerIdleTimeout - time.Since(oldest); remaining < 0 {
287304
return 0
288305
} else {
289306
return remaining + 100*time.Millisecond

0 commit comments

Comments
 (0)