Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit a39e025

Browse files
localai-bot and mudler authored
fix(nodes): make per-node backend install async via gallery job queue (mudler#9928)
* feat(galleryop): add TargetNodeID to ManagementOp for single-node installs
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(galleryop): add NodeScopedKey helpers for per-node opcache rows
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactor(galleryop): use strings.Cut for NodeScopedKey parsing, reject empty nodeID
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(nodes): scope DistributedBackendManager.InstallBackend to single node via TargetNodeID
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(http): make /api/nodes/:id/backends/install async via gallery service job queue
  The handler previously called unloader.InstallBackend synchronously and blocked the browser for up to 3 minutes waiting on the NATS reply. It now enqueues a TargetNodeID-scoped ManagementOp on BackendGalleryChannel and returns HTTP 202 + jobID immediately, matching /api/backends/install/:id.
  The opcache key is built via NodeScopedKey(nodeID, backend) so concurrent installs of the same backend across different nodes do not stomp each other. galleryService/opcache/appConfig are threaded through RegisterNodeAdminRoutes for this.
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactor(http): log malformed backend_galleries override and stop test drain goroutine
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(api): expose nodeID for node-scoped backend ops in /api/operations
  Node-scoped backend installs land in opcache under "node:<nodeID>:<backend>" keys. Without splitting that prefix back out, the operations panel renders the full key as the display name and has no structured way to label which worker an install is targeting. Detect the prefix, surface nodeID as its own response field, and reduce the display name back to the bare backend slug. Bare (non-scoped) ops are left untouched so legacy installs do not gain a misleading empty nodeID.
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* feat(react-ui): poll job status for node-targeted backend installs
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* fix(react-ui): make NodeInstallPicker state updates pure and surface cancellations as errors
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactor(react-ui): clarify async semantics in handleInstallOnTarget
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

* refactor(http): use statusUrl casing for node install response to match codebase precedent
  Assisted-by: Claude:opus-4-7 [Edit] [Bash]
  Signed-off-by: Ettore Di Giacinto <[email protected]>

---------

Signed-off-by: Ettore Di Giacinto <[email protected]>
Co-authored-by: Ettore Di Giacinto <[email protected]>
1 parent 05e8e1e commit a39e025

12 files changed

Lines changed: 590 additions & 60 deletions

File tree

core/http/app.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ func API(application *application.Application) (*echo.Echo, error) {
407407
}
408408
}
409409
routes.RegisterNodeSelfServiceRoutes(e, registry, distCfg.RegistrationToken, distCfg.AutoApproveNodes, application.AuthDB(), application.ApplicationConfig().Auth.APIKeyHMACSecret)
410-
routes.RegisterNodeAdminRoutes(e, registry, remoteUnloader, adminMiddleware, application.AuthDB(), application.ApplicationConfig().Auth.APIKeyHMACSecret, application.ApplicationConfig().Distributed.RegistrationToken)
410+
routes.RegisterNodeAdminRoutes(e, registry, remoteUnloader, application.GalleryService(), opcache, application.ApplicationConfig(), adminMiddleware, application.AuthDB(), application.ApplicationConfig().Auth.APIKeyHMACSecret, application.ApplicationConfig().Distributed.RegistrationToken)
411411

412412
// Distributed SSE routes (job progress + agent events via NATS)
413413
if d := application.Distributed(); d != nil {

core/http/endpoints/localai/nodes.go

Lines changed: 71 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@ import (
1616
"github.com/google/uuid"
1717
"github.com/gorilla/websocket"
1818
"github.com/labstack/echo/v4"
19+
"github.com/mudler/LocalAI/core/config"
20+
"github.com/mudler/LocalAI/core/gallery"
1921
"github.com/mudler/LocalAI/core/http/auth"
2022
"github.com/mudler/LocalAI/core/schema"
23+
"github.com/mudler/LocalAI/core/services/galleryop"
2124
"github.com/mudler/LocalAI/core/services/nodes"
2225
"github.com/mudler/xlog"
2326
"gorm.io/gorm"
@@ -381,14 +384,24 @@ func ResumeNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
381384
}
382385
}
383386

384-
// InstallBackendOnNodeEndpoint triggers backend installation on a worker node via NATS.
387+
// InstallBackendOnNodeEndpoint triggers backend installation on a worker node.
388+
// Async: enqueues a ManagementOp on the gallery service channel and returns a
389+
// jobID immediately. The gallery service worker goroutine drives the actual
390+
// install via DistributedBackendManager.InstallBackend, which honors the op's
391+
// TargetNodeID to scope the fan-out to one node. The UI polls /api/backends/job/:uid
392+
// for progress, mirroring /api/backends/install/:id.
393+
//
385394
// Backend can be either a gallery ID (resolved against BackendGalleries) or a
386-
// direct URI install (URI + Name + optional Alias) same shape as the
395+
// direct URI install (URI + Name + optional Alias) - same shape as the
387396
// standalone /api/backends/install-external path, just scoped to one node.
388-
func InstallBackendOnNodeEndpoint(unloader nodes.NodeCommandSender) echo.HandlerFunc {
397+
//
398+
// The legacy unloader argument is retained for signature symmetry with
399+
// DeleteBackendOnNodeEndpoint / ListBackendsOnNodeEndpoint but is no longer
400+
// used here - the async path goes through galleryService.
401+
func InstallBackendOnNodeEndpoint(_ nodes.NodeCommandSender, galleryService *galleryop.GalleryService, opcache *galleryop.OpCache, appConfig *config.ApplicationConfig) echo.HandlerFunc {
389402
return func(c echo.Context) error {
390-
if unloader == nil {
391-
return c.JSON(http.StatusServiceUnavailable, nodeError(http.StatusServiceUnavailable, "NATS not configured"))
403+
if galleryService == nil {
404+
return c.JSON(http.StatusServiceUnavailable, nodeError(http.StatusServiceUnavailable, "gallery service not configured"))
392405
}
393406
nodeID := c.Param("id")
394407
var req struct {
@@ -401,25 +414,65 @@ func InstallBackendOnNodeEndpoint(unloader nodes.NodeCommandSender) echo.Handler
401414
if err := c.Bind(&req); err != nil {
402415
return c.JSON(http.StatusBadRequest, nodeError(http.StatusBadRequest, "invalid request body"))
403416
}
404-
// Either a gallery backend name or a direct URI must be supplied.
405417
if req.Backend == "" && req.URI == "" {
406418
return c.JSON(http.StatusBadRequest, nodeError(http.StatusBadRequest, "backend name or uri required"))
407419
}
408-
// Admin-driven backend install: not tied to a specific replica slot
409-
// (no model is being loaded). Pass replica 0 to match the worker's
410-
// admin process-key convention (`backend#0`). The worker's fast path
411-
// takes over if the backend is already running — upgrades go through
412-
// the dedicated /api/backends/upgrade path on backend.upgrade.
413-
reply, err := unloader.InstallBackend(nodeID, req.Backend, "", req.BackendGalleries, req.URI, req.Name, req.Alias, 0)
420+
421+
jobUUID, err := uuid.NewUUID()
414422
if err != nil {
415-
xlog.Error("Failed to install backend on node", "node", nodeID, "backend", req.Backend, "uri", req.URI, "error", err)
416-
return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to install backend on node"))
423+
return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "failed to generate job id"))
424+
}
425+
jobID := jobUUID.String()
426+
427+
// Cache key: for gallery installs, use the backend slug; for URI
428+
// installs prefer the provided Name (falling back to URI). All keys
429+
// are node-scoped so concurrent installs of the same backend on
430+
// different nodes do not stomp each other in opcache.
431+
backendKey := req.Backend
432+
if backendKey == "" {
433+
backendKey = req.Name
434+
if backendKey == "" {
435+
backendKey = req.URI
436+
}
417437
}
418-
if !reply.Success {
419-
xlog.Error("Backend install failed on node", "node", nodeID, "backend", req.Backend, "uri", req.URI, "error", reply.Error)
420-
return c.JSON(http.StatusInternalServerError, nodeError(http.StatusInternalServerError, "backend installation failed"))
438+
cacheKey := galleryop.NodeScopedKey(nodeID, backendKey)
439+
opcache.SetBackend(cacheKey, jobID)
440+
441+
// Optional caller-supplied galleries override. Mirrors the standalone
442+
// install path so an admin can point at a private gallery.
443+
galleries := appConfig.BackendGalleries
444+
if req.BackendGalleries != "" {
445+
var custom []config.Gallery
446+
if err := json.Unmarshal([]byte(req.BackendGalleries), &custom); err != nil {
447+
xlog.Warn("Ignoring malformed backend_galleries override; falling back to configured galleries", "error", err, "nodeID", nodeID)
448+
} else if len(custom) > 0 {
449+
galleries = custom
450+
}
451+
}
452+
453+
ctx, cancelFunc := context.WithCancel(context.Background())
454+
op := galleryop.ManagementOp[gallery.GalleryBackend, any]{
455+
ID: jobID,
456+
GalleryElementName: req.Backend,
457+
Galleries: galleries,
458+
TargetNodeID: nodeID,
459+
ExternalURI: req.URI,
460+
ExternalName: req.Name,
461+
ExternalAlias: req.Alias,
462+
Context: ctx,
463+
CancelFunc: cancelFunc,
421464
}
422-
return c.JSON(http.StatusOK, map[string]string{"message": "backend installed"})
465+
galleryService.StoreCancellation(jobID, cancelFunc)
466+
go func() {
467+
galleryService.BackendGalleryChannel <- op
468+
}()
469+
470+
xlog.Info("Node-scoped backend install dispatched", "node", nodeID, "backend", req.Backend, "uri", req.URI, "jobID", jobID)
471+
return c.JSON(http.StatusAccepted, map[string]string{
472+
"jobID": jobID,
473+
"statusUrl": "/api/backends/job/" + jobID,
474+
"message": "backend installation started",
475+
})
423476
}
424477
}
425478

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package localai_test
2+
3+
import (
4+
"bytes"
5+
"encoding/json"
6+
"net/http"
7+
"net/http/httptest"
8+
9+
"github.com/labstack/echo/v4"
10+
. "github.com/onsi/ginkgo/v2"
11+
. "github.com/onsi/gomega"
12+
13+
"github.com/mudler/LocalAI/core/config"
14+
"github.com/mudler/LocalAI/core/gallery"
15+
"github.com/mudler/LocalAI/core/http/endpoints/localai"
16+
"github.com/mudler/LocalAI/core/services/galleryop"
17+
)
18+
19+
// InstallBackendOnNodeEndpoint became async to stop blocking the browser on
20+
// the 3-minute NATS reply timeout. These specs lock in the new contract:
21+
// HTTP 202 with a jobID, a ManagementOp enqueued on the gallery channel, and
22+
// an opcache entry keyed by NodeScopedKey so concurrent installs of the same
23+
// backend on different nodes do not stomp each other.
24+
var _ = Describe("InstallBackendOnNodeEndpoint async behavior", func() {
25+
var (
26+
e *echo.Echo
27+
galleryService *galleryop.GalleryService
28+
opcache *galleryop.OpCache
29+
appCfg *config.ApplicationConfig
30+
dispatched chan galleryop.ManagementOp[gallery.GalleryBackend, any]
31+
done chan struct{}
32+
drainExited chan struct{}
33+
)
34+
35+
BeforeEach(func() {
36+
e = echo.New()
37+
appCfg = &config.ApplicationConfig{
38+
BackendGalleries: []config.Gallery{{Name: "test-gallery", URL: "http://example.com"}},
39+
}
40+
galleryService = galleryop.NewGalleryService(appCfg, nil)
41+
opcache = galleryop.NewOpCache(galleryService)
42+
// Drain the gallery channel into a buffered side channel so the
43+
// handler's `go func() { ch <- op }()` send does not block waiting
44+
// for the real worker (which is not running in this unit test).
45+
dispatched = make(chan galleryop.ManagementOp[gallery.GalleryBackend, any], 4)
46+
done = make(chan struct{})
47+
drainExited = make(chan struct{})
48+
go func() {
49+
defer close(drainExited)
50+
for {
51+
select {
52+
case op := <-galleryService.BackendGalleryChannel:
53+
dispatched <- op
54+
case <-done:
55+
return
56+
}
57+
}
58+
}()
59+
})
60+
61+
AfterEach(func() {
62+
// Signal the drain goroutine to exit. We do NOT close
63+
// BackendGalleryChannel: the handler's dispatch goroutine may still
64+
// be pending (specs that don't Eventually-Receive), and a send on a
65+
// closed channel panics. Signalling via `done` lets the drain
66+
// goroutine return without touching the gallery channel.
67+
close(done)
68+
Eventually(drainExited, "2s").Should(BeClosed())
69+
})
70+
71+
It("returns 202 with a jobID and dispatches a TargetNodeID-scoped op", func() {
72+
body := `{"backend": "llama-cpp"}`
73+
req := httptest.NewRequest(http.MethodPost, "/api/nodes/node-xyz/backends/install", bytes.NewBufferString(body))
74+
req.Header.Set("Content-Type", "application/json")
75+
rec := httptest.NewRecorder()
76+
c := e.NewContext(req, rec)
77+
c.SetParamNames("id")
78+
c.SetParamValues("node-xyz")
79+
80+
handler := localai.InstallBackendOnNodeEndpoint(nil, galleryService, opcache, appCfg)
81+
Expect(handler(c)).To(Succeed())
82+
Expect(rec.Code).To(Equal(http.StatusAccepted))
83+
84+
var resp map[string]any
85+
Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
86+
Expect(resp["jobID"]).To(BeAssignableToTypeOf(""))
87+
Expect(resp["jobID"].(string)).ToNot(BeEmpty())
88+
Expect(resp["message"]).To(Equal("backend installation started"))
89+
90+
Eventually(dispatched, "2s").Should(Receive())
91+
Expect(opcache.Exists(galleryop.NodeScopedKey("node-xyz", "llama-cpp"))).To(BeTrue())
92+
Expect(opcache.IsBackendOp(galleryop.NodeScopedKey("node-xyz", "llama-cpp"))).To(BeTrue())
93+
})
94+
95+
It("returns 400 when neither backend nor uri is supplied", func() {
96+
req := httptest.NewRequest(http.MethodPost, "/api/nodes/node-xyz/backends/install", bytes.NewBufferString(`{}`))
97+
req.Header.Set("Content-Type", "application/json")
98+
rec := httptest.NewRecorder()
99+
c := e.NewContext(req, rec)
100+
c.SetParamNames("id")
101+
c.SetParamValues("node-xyz")
102+
103+
handler := localai.InstallBackendOnNodeEndpoint(nil, galleryService, opcache, appCfg)
104+
Expect(handler(c)).To(Succeed())
105+
Expect(rec.Code).To(Equal(http.StatusBadRequest))
106+
})
107+
108+
It("accepts a direct URI install and uses the name as the cache key", func() {
109+
body := `{"uri": "oci://example.com/custom-backend:v1", "name": "custom"}`
110+
req := httptest.NewRequest(http.MethodPost, "/api/nodes/node-xyz/backends/install", bytes.NewBufferString(body))
111+
req.Header.Set("Content-Type", "application/json")
112+
rec := httptest.NewRecorder()
113+
c := e.NewContext(req, rec)
114+
c.SetParamNames("id")
115+
c.SetParamValues("node-xyz")
116+
117+
handler := localai.InstallBackendOnNodeEndpoint(nil, galleryService, opcache, appCfg)
118+
Expect(handler(c)).To(Succeed())
119+
Expect(rec.Code).To(Equal(http.StatusAccepted))
120+
121+
Expect(opcache.Exists(galleryop.NodeScopedKey("node-xyz", "custom"))).To(BeTrue())
122+
})
123+
})

0 commit comments

Comments
 (0)