@@ -21,9 +21,9 @@ const (
2121 // being it is almost certainly greater than the number of models that most
2222 // developers' systems will be able to load.
2323 maximumRunnerSlots = 8
24- // runnerIdleTimeout is the maximum amount of time that a runner can sit
25- // idle (i.e. without any requests) before being terminated.
26- runnerIdleTimeout = 5 * time .Minute
24+ // defaultRunnerIdleTimeout is the default maximum amount of time that a
25+ // runner can sit idle (i.e. without any requests) before being terminated.
26+ defaultRunnerIdleTimeout = 5 * time .Minute
2727)
2828
2929var (
@@ -59,6 +59,8 @@ type loader struct {
5959 backends map [string ]inference.Backend
6060 // modelManager is the shared model manager.
6161 modelManager * models.Manager
62+ // runnerIdleTimeout is the loader-specific default runner idle timeout.
63+ runnerIdleTimeout time.Duration
6264 // totalMemory is the total system memory allocated to the loader.
6365 totalMemory uint64
6466 // idleCheck is used to signal the run loop when timestamps have updated.
@@ -104,6 +106,19 @@ func newLoader(
104106 // tune this heuristic for systems with enormous amounts of VRAM.
105107 nSlots := min (runtime .NumCPU (), maximumRunnerSlots )
106108
109+ // Check whether we're in a cloud environment with NVIDIA_VISIBLE_DEVICES set (treated as GPU-enabled).
110+ isGPUEnabledCloudEnvironment := environment .Get () == environment .EnvironmentCloud &&
111+ os .Getenv ("NVIDIA_VISIBLE_DEVICES" ) != ""
112+
113+ // Compute the idle runner timeout.
114+ //
115+ // HACK: On GPU-enabled cloud engines, we'll bump this to 8 hours. We can
116+ // remove this once we have configurable TTLs.
117+ runnerIdleTimeout := defaultRunnerIdleTimeout
118+ if isGPUEnabledCloudEnvironment {
119+ runnerIdleTimeout = 8 * time .Hour
120+ }
121+
107122 // Compute the amount of available memory.
108123 //
109124 // TODO: For now, we treat the system as having memory size 1 and all models
@@ -114,28 +129,30 @@ func newLoader(
114129 // computing model size through estimation (using parameter count and
115130 // quantization data type size).
116131 //
117- // HACK: On GPU-enabled cloud engines, we'll temporarily bump this to 2.
132+ // HACK: On GPU-enabled cloud engines, we'll bump this to 2. We can remove
133+ // this once we have VRAM estimation.
118134 totalMemory := uint64 (1 )
119- if environment . Get () == environment . EnvironmentCloud && os . Getenv ( "NVIDIA_VISIBLE_DEVICES" ) != "" {
135+ if isGPUEnabledCloudEnvironment {
120136 totalMemory = 2
121137 }
122138
123139 // Create the loader.
124140 l := & loader {
125- log : log ,
126- backends : backends ,
127- modelManager : modelManager ,
128- totalMemory : totalMemory ,
129- idleCheck : make (chan struct {}, 1 ),
130- guard : make (chan struct {}, 1 ),
131- availableMemory : totalMemory ,
132- waiters : make (map [chan <- struct {}]bool ),
133- runners : make (map [runnerKey ]int , nSlots ),
134- slots : make ([]* runner , nSlots ),
135- references : make ([]uint , nSlots ),
136- allocations : make ([]uint64 , nSlots ),
137- timestamps : make ([]time.Time , nSlots ),
138- runnerConfigs : make (map [runnerKey ]inference.BackendConfiguration ),
141+ log : log ,
142+ backends : backends ,
143+ modelManager : modelManager ,
144+ runnerIdleTimeout : runnerIdleTimeout ,
145+ totalMemory : totalMemory ,
146+ idleCheck : make (chan struct {}, 1 ),
147+ guard : make (chan struct {}, 1 ),
148+ availableMemory : totalMemory ,
149+ waiters : make (map [chan <- struct {}]bool ),
150+ runners : make (map [runnerKey ]int , nSlots ),
151+ slots : make ([]* runner , nSlots ),
152+ references : make ([]uint , nSlots ),
153+ allocations : make ([]uint64 , nSlots ),
154+ timestamps : make ([]time.Time , nSlots ),
155+ runnerConfigs : make (map [runnerKey ]inference.BackendConfiguration ),
139156 }
140157 l .guard <- struct {}{}
141158 return l
@@ -176,7 +193,7 @@ func (l *loader) evict(idleOnly bool) int {
176193 now := time .Now ()
177194 for r , slot := range l .runners {
178195 unused := l .references [slot ] == 0
179- idle := unused && now .Sub (l .timestamps [slot ]) > runnerIdleTimeout
196+ idle := unused && now .Sub (l .timestamps [slot ]) > l . runnerIdleTimeout
180197 defunct := false
181198 select {
182199 case <- l .slots [slot ].done :
@@ -283,7 +300,7 @@ func (l *loader) idleCheckDuration() time.Duration {
283300 // Compute the remaining duration. If negative, check immediately, otherwise
284301 // wait until 100 milliseconds after expiration time (to avoid checking
285302 // right on the expiration boundary).
286- if remaining := runnerIdleTimeout - time .Since (oldest ); remaining < 0 {
303+ if remaining := l . runnerIdleTimeout - time .Since (oldest ); remaining < 0 {
287304 return 0
288305 } else {
289306 return remaining + 100 * time .Millisecond
0 commit comments