From 530e04743ed8ae068623074bef553500fd7bb254 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Thu, 28 May 2026 15:53:14 +0100 Subject: [PATCH 1/3] fix(router): score classifier production-readiness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Conversation trimming runs through the classifier model's chat template and trims by exact token count, sized to the model's n_batch, which is now scaled to context so long probes can't crash the backend. Missing chat_message templates are a hard error at router build time. Router-facing factories (Embedder/Scorer/Reranker/TokenCounter) re-resolve ModelConfig per call so a model installed post-startup doesn't bind a stub Backend="" config and silently fall into the loader's auto-iterate path. New 'vector_store' backend trace recorded inside localVectorStore on every Search/Insert — including the backend-load-failure path that previously vanished into an xlog.Warn — with outcome tagging (hit/miss/empty_store/backend_load_error/find_error/insert_error/ok). Companion cleanup drops misleading similarity:0 and input_tokens_count:0 from non-hit and text-mode traces. Gallery local-store-development aliases to 'local-store' so the master image satisfies pkg/model.LocalStoreBackend lookups from the embedding cache. Misc: llama-cpp TokenizeString reads the correct 'prompt' JSON key (the original bug); ModelTokenize nil-guard; non-fatal mitm proxy startup; PII 'route_local' renamed to 'allow' with docs/UI in sync; model-editor footer no longer eats the edit area on small screens; several config-editor template/dropdown/section fixes. Tests: e2e router specs (casual/code-hint + long-conversation trim), vector_store trace specs, lazy-factory specs, gallery dev-alias resolution, Playwright trace badge + scroll regression. 
Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Richard Palethorpe --- .github/workflows/test-extra.yml | 2 +- backend/cpp/llama-cpp/grpc-server.cpp | 2 +- backend/index.yaml | 2 + core/application/mitm.go | 23 +++ core/application/mitm_test.go | 58 ++++++ core/application/router_factories.go | 109 ++++++++--- core/application/router_factories_test.go | 155 +++++++++++++++ core/application/startup.go | 6 +- core/backend/embeddings.go | 9 +- core/backend/options.go | 33 ++-- core/backend/options_internal_test.go | 45 +++++ core/backend/stores.go | 90 +++++++-- core/backend/stores_test.go | 88 +++++++++ core/backend/tokenize.go | 23 ++- core/backend/tokenize_test.go | 27 +++ core/config/application_config.go | 2 +- core/config/meta/build.go | 3 + core/config/meta/constants.go | 1 + core/config/meta/registry.go | 60 +++--- core/config/meta/registry_coverage_test.go | 1 - core/config/meta/types.go | 6 + core/config/model_config.go | 24 ++- core/config/model_config_test.go | 12 ++ core/gallery/backends_test.go | 9 +- core/http/endpoints/anthropic/messages.go | 2 +- .../endpoints/localai/api_instructions.go | 2 +- core/http/endpoints/localai/config_meta.go | 2 + core/http/endpoints/localai/pii_decide.go | 24 +-- .../http/endpoints/localai/pii_decide_test.go | 21 ++- core/http/endpoints/openai/completion.go | 2 +- core/http/endpoints/openai/realtime_model.go | 23 +-- core/http/middleware/probe_trim_test.go | 139 ++++++++++++++ core/http/middleware/route_model.go | 163 +++++++++++----- core/http/middleware/route_model_test.go | 31 ++- core/http/react-ui/e2e/model-config.spec.js | 34 ++++ .../e2e/model-editor-back-nav.spec.js | 94 +++++++++ core/http/react-ui/e2e/traces-errors.spec.js | 74 ++++++++ core/http/react-ui/playwright.config.js | 6 + .../react-ui/src/components/CodeEditor.jsx | 27 +-- .../src/components/ConfigFieldRenderer.jsx | 3 +- .../src/components/PIIPatternListEditor.jsx | 2 +- core/http/react-ui/src/pages/AgentJobs.jsx | 6 +- 
core/http/react-ui/src/pages/Chat.jsx | 6 +- core/http/react-ui/src/pages/Manage.jsx | 6 +- core/http/react-ui/src/pages/Middleware.jsx | 41 ++-- core/http/react-ui/src/pages/ModelEditor.jsx | 65 ++++--- core/http/react-ui/src/pages/Models.jsx | 6 +- core/http/react-ui/src/pages/Talk.jsx | 8 +- core/http/react-ui/src/pages/Traces.jsx | 1 + core/http/react-ui/src/utils/cmGoTemplate.js | 46 +++++ core/http/react-ui/src/utils/editorNav.js | 15 ++ core/http/routes/anthropic.go | 15 +- core/http/routes/middleware.go | 19 +- core/http/routes/openai.go | 15 +- core/http/routes/pii.go | 12 +- core/schema/localai.go | 13 +- core/services/cloudproxy/backend_forward.go | 2 +- core/services/routing/pii/config.go | 2 +- core/services/routing/pii/config_test.go | 4 +- core/services/routing/pii/middleware.go | 18 +- core/services/routing/pii/middleware_test.go | 16 +- core/services/routing/pii/redactor.go | 20 +- core/services/routing/pii/redactor_test.go | 10 +- core/services/routing/pii/stream.go | 5 +- core/services/routing/pii/types.go | 36 ++-- .../routing/router/embedding_cache.go | 15 +- .../routing/router/embedding_cache_test.go | 55 ++++++ core/services/routing/router/rerank.go | 19 +- core/services/routing/router/rerank_test.go | 27 +++ core/services/routing/router/score.go | 30 ++- core/services/routing/router/score_test.go | 137 ++++++++++++++ core/services/routing/router/trim.go | 178 ++++++++++++++++++ core/services/routing/router/types.go | 9 + core/trace/backend_trace.go | 1 + docs/content/features/middleware.md | 17 +- pkg/mcp/localaitools/dto.go | 14 +- pkg/mcp/localaitools/inproc/client.go | 6 +- pkg/mcp/localaitools/tools_middleware.go | 4 +- pkg/mcp/localaitools/tools_pii.go | 2 +- tests/e2e-backends/backend_test.go | 18 ++ tests/e2e/e2e_router_test.go | 90 +++++++++ tests/e2e/e2e_suite_test.go | 59 ++++++ tests/e2e/mock-backend/main.go | 103 +++++++++- 83 files changed, 2219 insertions(+), 391 deletions(-) create mode 100644 core/application/mitm_test.go 
create mode 100644 core/application/router_factories_test.go create mode 100644 core/backend/stores_test.go create mode 100644 core/backend/tokenize_test.go create mode 100644 core/http/middleware/probe_trim_test.go create mode 100644 core/http/react-ui/e2e/model-editor-back-nav.spec.js create mode 100644 core/http/react-ui/src/utils/cmGoTemplate.js create mode 100644 core/http/react-ui/src/utils/editorNav.js create mode 100644 core/services/routing/router/trim.go create mode 100644 tests/e2e/e2e_router_test.go diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index 12c186ca2870..3296c6d84578 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -563,7 +563,7 @@ jobs: - name: Run e2e-backends smoke env: BACKEND_IMAGE: quay.io/go-skynet/local-ai-backends:master-cpu-llama-cpp - BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias + BACKEND_TEST_CAPS: health,load,predict,stream,logprobs,logit_bias,tokenize run: | make test-extra-backend # Realtime e2e with sherpa-onnx driving VAD + STT + TTS against a mocked LLM. 
diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 2ca329134fc4..c9effdb471fd 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -3436,7 +3436,7 @@ class BackendServiceImpl final : public backend::Backend::Service { if (body.count("prompt") != 0) { const bool add_special = json_value(body, "add_special", false); - llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.impl->vocab, body.at("prompt"), add_special, true); for (const auto& token : tokens) { diff --git a/backend/index.yaml b/backend/index.yaml index 37e6890710e4..d2ced5d356f6 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -1557,6 +1557,7 @@ - localai/localai-backends:master-metal-darwin-arm64-kitten-tts - !!merge <<: *local-store name: "local-store-development" + alias: "local-store" uri: "quay.io/go-skynet/local-ai-backends:master-cpu-local-store" mirrors: - localai/localai-backends:master-cpu-local-store @@ -1567,6 +1568,7 @@ - localai/localai-backends:latest-metal-darwin-arm64-local-store - !!merge <<: *local-store name: "metal-local-store-development" + alias: "local-store" uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-local-store" mirrors: - localai/localai-backends:master-metal-darwin-arm64-local-store diff --git a/core/application/mitm.go b/core/application/mitm.go index 293b3d449c20..a67a68934ae0 100644 --- a/core/application/mitm.go +++ b/core/application/mitm.go @@ -11,6 +11,29 @@ import ( "github.com/mudler/xlog" ) +// startMITMIfConfigured brings up the cloudproxy MITM listener when an +// address is configured, treating any startup failure as non-fatal. +// +// The listener is opt-in middleware whose address is persisted in runtime +// settings (/api/settings → runtime_settings.json) and replayed on every +// boot. A bad value — e.g. 
a host the process can't bind, like a LAN IP +// inside a container — must NOT abort the whole server: doing so crash-loops +// with no way out, because the Settings UI used to correct the address can't +// load if startup never completes. So on failure we log loudly and carry on; +// the admin fixes the address via /api/settings, which calls RestartMITM. +func startMITMIfConfigured(app *Application, options *config.ApplicationConfig) { + if options.MITMListen == "" { + return + } + if err := startMITMProxy(app, options); err != nil { + xlog.Error("mitm: cloudproxy listener failed to start — continuing without it", + "listen", options.MITMListen, + "error", err, + "hint", "fix the address via Settings (e.g. \":8082\" to bind all interfaces) and the listener will restart", + ) + } +} + func startMITMProxy(app *Application, options *config.ApplicationConfig) error { app.mitmMutex.Lock() defer app.mitmMutex.Unlock() diff --git a/core/application/mitm_test.go b/core/application/mitm_test.go new file mode 100644 index 000000000000..b7627fa2d66c --- /dev/null +++ b/core/application/mitm_test.go @@ -0,0 +1,58 @@ +package application + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// minimal Application wired enough for startMITMProxy: an empty model +// config loader (no host claims), CA written under a temp DataPath. 
+func newMITMTestApp(dataPath string) (*Application, *config.ApplicationConfig) { + state, err := system.GetSystemState() + Expect(err).NotTo(HaveOccurred()) + state.Model.ModelsPath = dataPath + opts := config.NewApplicationConfig( + config.WithSystemState(state), + config.WithDataPath(dataPath), + ) + return newApplication(opts), opts +} + +var _ = Describe("startMITMIfConfigured", func() { + It("does nothing when no listen address is configured", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + opts.MITMListen = "" + + Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic()) + Expect(app.mitmServer.Load()).To(BeNil(), "no listener should be stored when disabled") + }) + + // Regression: a persisted-but-unbindable MITM address (e.g. a LAN host + // inside a container) must not abort startup. startMITMIfConfigured + // swallows the bind error so the rest of LocalAI still comes up and the + // admin can fix the address via the Settings UI. + It("logs and continues when the listen address cannot be bound", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + // 192.0.2.1 is TEST-NET-1 (RFC 5737): guaranteed not assigned to any + // local interface, so bind fails deterministically without DNS. 
+ opts.MITMListen = "192.0.2.1:8082" + + Expect(func() { startMITMIfConfigured(app, opts) }).NotTo(Panic()) + Expect(app.mitmServer.Load()).To(BeNil(), "failed listener must not be stored") + }) + + It("starts and stores the listener on a bindable address", func() { + app, opts := newMITMTestApp(GinkgoT().TempDir()) + opts.MITMListen = "127.0.0.1:0" // OS-assigned free port + + startMITMIfConfigured(app, opts) + + srv := app.mitmServer.Load() + Expect(srv).NotTo(BeNil(), "listener should be stored on success") + DeferCleanup(srv.Stop) + Expect(srv.Addr()).NotTo(BeEmpty()) + }) +}) diff --git a/core/application/router_factories.go b/core/application/router_factories.go index d37cfb9d8115..879c43a835ee 100644 --- a/core/application/router_factories.go +++ b/core/application/router_factories.go @@ -1,63 +1,120 @@ package application import ( + "context" + "fmt" + "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" ) -// adapterConfig resolves a model name to its runtime ModelConfig, or -// nil when the name is unknown. Shared by the router-facing factories -// below and by ModelConfigLookup. +// adapterConfig resolves a model name to its runtime ModelConfig, or nil when +// unknown. LoadModelConfigFileByNameDefaultOptions never returns nil — for an +// unknown name it returns a defaults-filled stub with an empty Name (the YAML +// `name:` field is required by Validate), which is how we tell the two apart. func (a *Application) adapterConfig(modelName string) *config.ModelConfig { cfg, err := a.backendLoader.LoadModelConfigFileByNameDefaultOptions(modelName, a.applicationConfig) - if err != nil || cfg == nil { + if err != nil || cfg == nil || cfg.Name == "" { return nil } return cfg } -// ModelConfigLookup is the lookup function the router middleware's -// classifier validator uses to confirm classifier_model declares -// FLAG_SCORE before binding it. 
+// ModelConfigLookup is the lookup the router middleware's classifier validator +// uses to confirm classifier_model declares FLAG_SCORE before binding it. func (a *Application) ModelConfigLookup() func(modelName string) *config.ModelConfig { return a.adapterConfig } -// Scorer returns a backend.Scorer bound to the named model, or nil -// when the model is unknown. Used as a method value (app.Scorer) by -// router.ClassifierDeps — no factory-of-factory wrapper needed. +// The router-facing factories below (Scorer, Embedder, Reranker, TokenCounter) +// bind a model NAME at construction and re-resolve the CONFIG on every call. +// Capturing the config at construction would bake in whatever state +// adapterConfig saw first — including a stub returned before the YAML reached +// bcl.configs (e.g. /import-model or gallery install racing startup). The +// classifier registry caches factories by router-config fingerprint, so a +// once-stale capture stays stale until the router config is edited. + func (a *Application) Scorer(modelName string) backend.Scorer { - cfg := a.adapterConfig(modelName) + if a.adapterConfig(modelName) == nil { + return nil + } + return &lazyScorer{app: a, modelName: modelName} +} + +type lazyScorer struct { + app *Application + modelName string +} + +func (l *lazyScorer) Score(ctx context.Context, prompt string, candidates []string) ([]backend.CandidateScore, error) { + cfg := l.app.adapterConfig(l.modelName) if cfg == nil { + return nil, fmt.Errorf("scorer: model %q no longer available", l.modelName) + } + return backend.NewScorer(l.app.modelLoader, *cfg, l.app.applicationConfig).Score(ctx, prompt, candidates) +} + +// TokenCounter returns a func so the middleware's literal field type accepts +// it as a method value without importing core/http/middleware from here. 
+func (a *Application) TokenCounter(modelName string) func(string) (int, error) { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewScorer(a.modelLoader, *cfg, a.applicationConfig) + return func(text string) (int, error) { + cfg := a.adapterConfig(modelName) + if cfg == nil { + return 0, fmt.Errorf("token counter: model %q no longer available", modelName) + } + resp, err := backend.ModelTokenize(text, a.modelLoader, *cfg, a.applicationConfig) + if err != nil { + return 0, err + } + return len(resp.Tokens), nil + } } -// Reranker returns a backend.Reranker bound to the named model, or -// nil when unknown. The reranker model's `type:` (e.g. "colbert") -// selects the scoring head inside the rerankers backend. func (a *Application) Reranker(modelName string) backend.Reranker { - cfg := a.adapterConfig(modelName) - if cfg == nil { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewReranker(a.modelLoader, *cfg, a.applicationConfig) + return &lazyReranker{app: a, modelName: modelName} } -// Embedder returns a backend.Embedder bound to the named model, or -// nil when unknown. Used by the router's L2 embedding cache. 
-func (a *Application) Embedder(modelName string) backend.Embedder { - cfg := a.adapterConfig(modelName) +type lazyReranker struct { + app *Application + modelName string +} + +func (l *lazyReranker) Rerank(ctx context.Context, query string, documents []string) ([]backend.RerankResult, error) { + cfg := l.app.adapterConfig(l.modelName) if cfg == nil { + return nil, fmt.Errorf("reranker: model %q no longer available", l.modelName) + } + return backend.NewReranker(l.app.modelLoader, *cfg, l.app.applicationConfig).Rerank(ctx, query, documents) +} + +func (a *Application) Embedder(modelName string) backend.Embedder { + if a.adapterConfig(modelName) == nil { return nil } - return backend.NewEmbedder(a.modelLoader, *cfg, a.applicationConfig) + return &lazyEmbedder{app: a, modelName: modelName} +} + +type lazyEmbedder struct { + app *Application + modelName string +} + +func (l *lazyEmbedder) Embed(ctx context.Context, text string) ([]float32, error) { + cfg := l.app.adapterConfig(l.modelName) + if cfg == nil { + return nil, fmt.Errorf("embedder: model %q no longer available", l.modelName) + } + return backend.NewEmbedder(l.app.modelLoader, *cfg, l.app.applicationConfig).Embed(ctx, text) } -// VectorStore returns a backend.VectorStore for the named collection, -// or nil when the name is empty. Each router model gets its own -// backend process via the model loader's cache keyed by storeName. +// VectorStore takes a store name, not a model name — no adapterConfig, no +// staleness to avoid. 
func (a *Application) VectorStore(storeName string) backend.VectorStore { return backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName) } diff --git a/core/application/router_factories_test.go b/core/application/router_factories_test.go new file mode 100644 index 000000000000..5a6988a88fba --- /dev/null +++ b/core/application/router_factories_test.go @@ -0,0 +1,155 @@ +package application + +import ( + "context" + "os" + "path/filepath" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// Regression: the router-facing factories used to capture +// *config.ModelConfig at construction. A gallery install that raced +// startup left a stub (Backend="") bound for the lifetime of the +// classifier registry's cache entry, bypassing the user's `backend:` +// config. These specs pin the lazy re-resolve. +var _ = Describe("router_factories lazy config resolution", func() { + var ( + tmpDir string + app *Application + ) + + BeforeEach(func() { + var err error + tmpDir, err = os.MkdirTemp("", "router-factories-*") + Expect(err).NotTo(HaveOccurred()) + + appCfg := &config.ApplicationConfig{ + Context: context.Background(), + SystemState: &system.SystemState{Model: system.Model{ModelsPath: tmpDir}}, + } + app = &Application{ + backendLoader: config.NewModelConfigLoader(tmpDir), + modelLoader: model.NewModelLoader(appCfg.SystemState), + applicationConfig: appCfg, + } + }) + + AfterEach(func() { + _ = os.RemoveAll(tmpDir) + }) + + // writeCfg seeds both the on-disk YAML and the in-memory cache — + // removing only the cache would fall through to file-read. 
+ writeCfg := func(name, backend string) { + yaml := "name: " + name + "\nbackend: " + backend + "\nparameters:\n model: " + name + ".bin\n" + Expect(os.WriteFile(filepath.Join(tmpDir, name+".yaml"), []byte(yaml), 0644)).To(Succeed()) + Expect(app.backendLoader.LoadModelConfigsFromPath(tmpDir)).To(Succeed()) + cfg, ok := app.backendLoader.GetModelConfig(name) + Expect(ok).To(BeTrue(), "config must be loaded before the spec runs") + Expect(cfg.Backend).To(Equal(backend)) + } + + // removeCfg purges both the cache and the YAML so LoadModelConfigFileByName + // returns the empty-stub case and adapterConfig returns nil. + removeCfg := func(name string) { + app.backendLoader.RemoveModelConfig(name) + Expect(os.Remove(filepath.Join(tmpDir, name+".yaml"))).To(Succeed()) + } + + Context("Embedder", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Embedder("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Embed call", func() { + writeCfg("emb-test", "llama-cpp") + emb := app.Embedder("emb-test") + Expect(emb).NotTo(BeNil()) + + // The factory must hold the NAME, not a captured config — + // otherwise stale captures survive cache invalidation. + lazy, ok := emb.(*lazyEmbedder) + Expect(ok).To(BeTrue(), "Embedder must return *lazyEmbedder") + Expect(lazy.modelName).To(Equal("emb-test")) + + // Mutate the cached config. A lazy implementation sees the + // update on the next adapterConfig call; a captured-at- + // construction implementation would still see "llama-cpp". + app.backendLoader.UpdateModelConfig("emb-test", func(c *config.ModelConfig) { + c.Backend = "rerankers" + }) + Expect(lazy.app.adapterConfig("emb-test").Backend).To(Equal("rerankers")) + + // Remove the config entirely → Embed must surface the disappearance. 
+ removeCfg("emb-test") + _, err := emb.Embed(context.Background(), "anything") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("Scorer", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Scorer("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Score call", func() { + writeCfg("score-test", "llama-cpp") + sc := app.Scorer("score-test") + Expect(sc).NotTo(BeNil()) + + lazy, ok := sc.(*lazyScorer) + Expect(ok).To(BeTrue(), "Scorer must return *lazyScorer") + Expect(lazy.modelName).To(Equal("score-test")) + + removeCfg("score-test") + _, err := sc.Score(context.Background(), "prompt", []string{"a"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("Reranker", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.Reranker("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each Rerank call", func() { + writeCfg("rerank-test", "rerankers") + rr := app.Reranker("rerank-test") + Expect(rr).NotTo(BeNil()) + + lazy, ok := rr.(*lazyReranker) + Expect(ok).To(BeTrue(), "Reranker must return *lazyReranker") + Expect(lazy.modelName).To(Equal("rerank-test")) + + removeCfg("rerank-test") + _, err := rr.Rerank(context.Background(), "q", []string{"d"}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + }) + + Context("TokenCounter", func() { + It("returns nil at construction for an unknown model", func() { + Expect(app.TokenCounter("missing")).To(BeNil()) + }) + + It("re-resolves the model config on each call", func() { + writeCfg("tok-test", "llama-cpp") + tc := app.TokenCounter("tok-test") + Expect(tc).NotTo(BeNil()) + + removeCfg("tok-test") + _, err := tc("anything") + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("no longer available")) + }) + 
}) +}) diff --git a/core/application/startup.go b/core/application/startup.go index be559479f2f8..83f73db5ee42 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -441,11 +441,7 @@ func New(opts ...config.AppOption) (*Application, error) { // traffic doesn't need a parallel config for MITM traffic. // Runs after loadRuntimeSettingsFromFile so a listener configured // via /api/settings is brought back up across restarts. - if options.MITMListen != "" { - if err := startMITMProxy(application, options); err != nil { - return nil, fmt.Errorf("mitm: startup: %w", err) - } - } + startMITMIfConfigured(application, options) application.ModelLoader().SetBackendLoggingEnabled(options.EnableBackendLogging) diff --git a/core/backend/embeddings.go b/core/backend/embeddings.go index 4be2bc346ef9..eff88ef04b19 100644 --- a/core/backend/embeddings.go +++ b/core/backend/embeddings.go @@ -100,8 +100,13 @@ func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.M trace.InitBackendTracingIfEnabled(appConfig.TracingMaxItems, appConfig.TracingMaxBodyBytes) traceData := map[string]any{ - "input_text": trace.TruncateString(s, 1000), - "input_tokens_count": len(tokens), + "input_text": trace.TruncateString(s, 1000), + } + // Only present for token-mode callers (pre-tokenized override); + // emitting "0" alongside input_text would read as "consumed zero + // tokens", which is wrong. 
+ if len(tokens) > 0 { + traceData["input_tokens_count"] = len(tokens) } startTime := time.Now() diff --git a/core/backend/options.go b/core/backend/options.go index c891b6d6729a..db7f5ef96528 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -88,9 +88,23 @@ func getSeed(c config.ModelConfig) int32 { } func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { + // Resolve context size first — the backend defaults to 4096 when unset, + // and batch sizing below has to match that effective value or the + // FLAG_SCORE guard misses the n_batch < n_ctx GGML_ASSERT crash. + ctxSize := 4096 + if c.ContextSize != nil { + ctxSize = *c.ContextSize + } + b := 512 if c.Batch != 0 { b = c.Batch + } else if c.HasUsecases(config.FLAG_SCORE) && ctxSize > b { + // Score models decode prompt+candidate in one llama_decode which + // asserts n_tokens <= n_batch and aborts on failure. Sizing the + // batch to n_ctx means anything that fits the context fits one + // decode. Explicit `batch:` in the config still wins. 
+ b = ctxSize } flashAttention := "auto" @@ -134,11 +148,6 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { } } - ctxSize := 4096 - if c.ContextSize != nil { - ctxSize = *c.ContextSize - } - mmlock := false if c.MMlock != nil { mmlock = *c.MMlock @@ -239,13 +248,13 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { if c.Backend == "cloud-proxy" { opts.Proxy = &pb.ProxyOptions{ - UpstreamUrl: c.Proxy.UpstreamURL, - Mode: c.Proxy.Mode, - Provider: c.Proxy.Provider, - ApiKeyEnv: c.Proxy.APIKeyEnv, - ApiKeyFile: c.Proxy.APIKeyFile, - UpstreamModel: c.Proxy.UpstreamModel, - RequestTimeoutSeconds: int32(c.Proxy.RequestTimeoutSeconds), + UpstreamUrl: c.Proxy.UpstreamURL, + Mode: c.Proxy.Mode, + Provider: c.Proxy.Provider, + ApiKeyEnv: c.Proxy.APIKeyEnv, + ApiKeyFile: c.Proxy.APIKeyFile, + UpstreamModel: c.Proxy.UpstreamModel, + RequestTimeoutSeconds: int32(c.Proxy.RequestTimeoutSeconds), } } diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index af5d59992ce0..5712c5c75994 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -75,3 +75,48 @@ var _ = Describe("gRPCPredictOpts enable_thinking metadata", func() { Expect(opts.Metadata).ToNot(HaveKey("enable_thinking")) }) }) + +var _ = Describe("grpcModelOpts NBatch", func() { + scoreUsecase := config.FLAG_SCORE + threads := 1 + ctx := 4096 + + It("defaults to 512 for an ordinary model", func() { + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(512)) + }) + + It("sizes the batch to the context window for score models", func() { + // Score models decode the whole prompt+candidate in one + // llama_decode; n_batch must cover it or the backend aborts. 
+ cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + }) + + It("keeps an explicit batch over the score default", func() { + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &ctx}, KnownUsecases: &scoreUsecase} + cfg.Batch = 1024 + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(1024)) + }) + + It("does not raise the batch when a score model's context is below the default", func() { + small := 256 + cfg := config.ModelConfig{Threads: &threads, LLMConfig: config.LLMConfig{ContextSize: &small}, KnownUsecases: &scoreUsecase} + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(512)) + }) + + It("sizes the batch to the effective 4096 default for a score model with no explicit context_size", func() { + // The crash case: the backend defaults n_ctx to 4096, so n_batch must + // follow even when context_size is unset — otherwise n_batch stays 512 + // against a 4096 window and the score decode hits the GGML_ASSERT. 
+ cfg := config.ModelConfig{Threads: &threads, KnownUsecases: &scoreUsecase} + Expect(cfg.ContextSize).To(BeNil()) + opts := grpcModelOpts(cfg, "/tmp/models") + Expect(opts.NBatch).To(BeEquivalentTo(4096)) + Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives") + }) +}) diff --git a/core/backend/stores.go b/core/backend/stores.go index 4884765f2f93..8b73ee17c017 100644 --- a/core/backend/stores.go +++ b/core/backend/stores.go @@ -3,9 +3,10 @@ package backend import ( "context" "fmt" - "strings" + "time" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc" "github.com/mudler/LocalAI/pkg/model" @@ -39,34 +40,85 @@ func (s *localVectorStore) backend(_ context.Context) (grpc.Backend, error) { return StoreBackend(s.loader, s.appConfig, s.storeName, "") } -func (s *localVectorStore) Search(ctx context.Context, vec []float32) (float64, []byte, bool, error) { - be, err := s.backend(ctx) - if err != nil { - return 0, nil, false, fmt.Errorf("vector store load: %w", err) +func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float64, payload []byte, ok bool, err error) { + start := time.Now() + outcome := "hit" + defer func() { + s.recordTrace(start, "search", len(vec), sim, outcome, err) + }() + be, berr := s.backend(ctx) + if berr != nil { + outcome = "backend_load_error" + return 0, nil, false, fmt.Errorf("vector store load: %w", berr) } - _, values, similarities, err := store.Find(ctx, be, vec, 1) - if err != nil { - // local-store's Find returns "existing length is -1" before - // any keys are inserted. Surface that as a clean miss so the - // cache layer treats it as an empty store and proceeds to - // Insert rather than skipping. 
- if strings.Contains(err.Error(), "existing length is -1") { - return 0, nil, false, nil - } - return 0, nil, false, fmt.Errorf("vector store find: %w", err) + _, values, similarities, ferr := store.Find(ctx, be, vec, 1) + if ferr != nil { + outcome = "find_error" + return 0, nil, false, fmt.Errorf("vector store find: %w", ferr) } if len(values) == 0 || len(similarities) == 0 { + outcome = "miss" return 0, nil, false, nil } return float64(similarities[0]), values[0], true, nil } -func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) error { - be, err := s.backend(ctx) +func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) { + start := time.Now() + outcome := "ok" + defer func() { + s.recordTrace(start, "insert", len(vec), 0, outcome, err) + }() + be, berr := s.backend(ctx) + if berr != nil { + outcome = "backend_load_error" + return fmt.Errorf("vector store load: %w", berr) + } + if serr := store.SetSingle(ctx, be, vec, payload); serr != nil { + outcome = "insert_error" + return serr + } + return nil +} + +// recordTrace surfaces vector-store calls in /api/backend-traces, including +// the backend-load-failure path that otherwise vanishes into an xlog.Warn. +// modelName uses the store namespace (e.g. "router-cache-smart-router") so +// admins can tell which router's cache misbehaved; the backend is always +// "local-store" and can't disambiguate. 
+func (s *localVectorStore) recordTrace(start time.Time, op string, vecDim int, sim float64, outcome string, err error) { + if s.appConfig == nil || !s.appConfig.EnableTracing { + return + } + trace.InitBackendTracingIfEnabled(s.appConfig.TracingMaxItems, s.appConfig.TracingMaxBodyBytes) + errStr := "" if err != nil { - return fmt.Errorf("vector store load: %w", err) + errStr = err.Error() + } + summary := op + " " + outcome + if op == "search" && outcome == "hit" { + summary = fmt.Sprintf("search hit (sim=%.3f)", sim) + } + data := map[string]any{ + "op": op, + "outcome": outcome, + "vector_dim": vecDim, + } + // Only include similarity for a real neighbor — miss/empty_store would + // otherwise render "similarity: 0" and read as a measured value. + if op == "search" && outcome == "hit" { + data["similarity"] = sim } - return store.SetSingle(ctx, be, vec, payload) + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: start, + Duration: time.Since(start), + Type: trace.BackendTraceVectorStore, + ModelName: s.storeName, + Backend: model.LocalStoreBackend, + Summary: summary, + Error: errStr, + Data: data, + }) } func StoreBackend(sl *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string, backend string) (grpc.Backend, error) { diff --git a/core/backend/stores_test.go b/core/backend/stores_test.go new file mode 100644 index 000000000000..e9d5208a3d45 --- /dev/null +++ b/core/backend/stores_test.go @@ -0,0 +1,88 @@ +package backend + +import ( + "context" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/trace" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +// findVectorStoreTrace returns the most recent vector_store trace whose +// model_name matches storeName, or nil if none was recorded. 
Used by +// the specs below to assert the trace landed without relying on +// ring-buffer ordering across other tests in the suite. +func findVectorStoreTrace(storeName string) *trace.BackendTrace { + traces := trace.GetBackendTraces() + for i := range traces { + bt := &traces[i] + if bt.Type == trace.BackendTraceVectorStore && bt.ModelName == storeName { + return bt + } + } + return nil +} + +var _ = Describe("localVectorStore tracing", func() { + // Pin the trace surface admins read from /api/backend-traces. + // The original failure mode that motivated these specs — the + // local-store backend not installed — was silent on every surface + // except a per-call xlog.Warn. With tracing wired in, the row + // appears next to the embedder/score traces for the same request. + BeforeEach(func() { + trace.ClearBackendTraces() + }) + + It("records a vector_store trace with outcome=backend_load_error when the backend can't be loaded", func() { + // nil ModelLoader → s.backend → StoreBackend → panics on load. + // Use a real-but-empty loader so the failure surfaces as an + // error instead, exercising the load-failure trace path the + // admin would hit when local-store isn't installed. + appCfg := &config.ApplicationConfig{ + EnableTracing: true, + TracingMaxItems: 16, + TracingMaxBodyBytes: 1024, + } + s := &localVectorStore{ + loader: model.NewModelLoader(&system.SystemState{}), + appConfig: appCfg, + storeName: "router-cache-test", + } + + // Search must surface the error AND record a trace describing it. 
+ _, _, _, err := s.Search(context.Background(), []float32{0.1, 0.2, 0.3}) + Expect(err).To(HaveOccurred()) + + Eventually(func() *trace.BackendTrace { + return findVectorStoreTrace("router-cache-test") + }).ShouldNot(BeNil()) + + bt := findVectorStoreTrace("router-cache-test") + Expect(bt.Backend).To(Equal(model.LocalStoreBackend)) + Expect(bt.Data["op"]).To(Equal("search")) + Expect(bt.Data["outcome"]).To(Equal("backend_load_error")) + Expect(bt.Data["vector_dim"]).To(Equal(3)) + // Error is the wrapped "vector store load: …" surfaced to the caller. + Expect(bt.Error).To(ContainSubstring("vector store load")) + }) + + It("does not record a trace when tracing is disabled", func() { + // Opt-out path: appConfig.EnableTracing=false must short-circuit + // before InitBackendTracingIfEnabled, so a workload with tracing + // turned off doesn't pay the channel-send cost per cache call. + appCfg := &config.ApplicationConfig{EnableTracing: false} + s := &localVectorStore{ + loader: model.NewModelLoader(&system.SystemState{}), + appConfig: appCfg, + storeName: "router-cache-disabled", + } + _, _, _, _ = s.Search(context.Background(), []float32{1}) + Consistently(func() *trace.BackendTrace { + return findVectorStoreTrace("router-cache-disabled") + }).Should(BeNil()) + }) +}) diff --git a/core/backend/tokenize.go b/core/backend/tokenize.go index 96618d89cdc2..6b926b1793a1 100644 --- a/core/backend/tokenize.go +++ b/core/backend/tokenize.go @@ -7,9 +7,23 @@ import ( "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/core/trace" "github.com/mudler/LocalAI/pkg/grpc" + pb "github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/model" ) +// tokenizeTokenCount returns the number of tokens in a backend response, +// treating a nil response as zero. The gRPC client returns (nil, err) on +// failure, and the tracing block below runs before that error is returned — +// so the count must be read nil-safely here. 
Reading resp.Tokens on a nil +// resp previously panicked the whole HTTP handler when tracing was enabled +// (e.g. a transient tokenize failure during router probe-budget sizing). +func tokenizeTokenCount(resp *pb.TokenizationResponse) int { + if resp == nil { + return 0 + } + return len(resp.Tokens) +} + func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) { var inferenceModel grpc.Backend @@ -40,10 +54,7 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model errStr = err.Error() } - tokenCount := 0 - if resp.Tokens != nil { - tokenCount = len(resp.Tokens) - } + tokenCount := tokenizeTokenCount(resp) trace.RecordBackendTrace(trace.BackendTrace{ Timestamp: startTime, @@ -64,8 +75,8 @@ func ModelTokenize(s string, loader *model.ModelLoader, modelConfig config.Model return schema.TokenizeResponse{}, err } - if resp.Tokens == nil { - resp.Tokens = make([]int32, 0) + if resp == nil || resp.Tokens == nil { + return schema.TokenizeResponse{Tokens: make([]int32, 0)}, nil } return schema.TokenizeResponse{ diff --git a/core/backend/tokenize_test.go b/core/backend/tokenize_test.go new file mode 100644 index 000000000000..3b5c8e9fbc6f --- /dev/null +++ b/core/backend/tokenize_test.go @@ -0,0 +1,27 @@ +package backend + +import ( + pb "github.com/mudler/LocalAI/pkg/grpc/proto" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("tokenizeTokenCount", func() { + // Regression: the gRPC client returns (nil, err) when a tokenize call + // fails, and ModelTokenize's tracing block reads the token count before + // the error is returned. Dereferencing a nil response there panicked the + // HTTP handler (nil pointer dereference) — e.g. a transient tokenize + // failure while the router sized its probe-token budget. 
+ It("returns zero for a nil response instead of panicking", func() { + Expect(tokenizeTokenCount(nil)).To(Equal(0)) + }) + + It("returns zero when the response carries no tokens", func() { + Expect(tokenizeTokenCount(&pb.TokenizationResponse{})).To(Equal(0)) + }) + + It("counts the tokens present on the response", func() { + Expect(tokenizeTokenCount(&pb.TokenizationResponse{Tokens: []int32{1, 2, 3}})).To(Equal(3)) + }) +}) diff --git a/core/config/application_config.go b/core/config/application_config.go index dd36b97b90fe..8e8b63cbfa7f 100644 --- a/core/config/application_config.go +++ b/core/config/application_config.go @@ -56,7 +56,7 @@ type ApplicationConfig struct { // // patterns: // - id: email - // action: route_local # downgrade default mask -> route_local + // action: allow # downgrade default mask -> allow (log only) // - id: ssn // action: block # upgrade default mask -> block // diff --git a/core/config/meta/build.go b/core/config/meta/build.go index 24cfb86b7962..39235b9998dd 100644 --- a/core/config/meta/build.go +++ b/core/config/meta/build.go @@ -93,6 +93,9 @@ func applyOverride(f *FieldMeta, o FieldMetaOverride) { if o.Component != "" { f.Component = o.Component } + if o.Language != "" { + f.Language = o.Language + } if o.Placeholder != "" { f.Placeholder = o.Placeholder } diff --git a/core/config/meta/constants.go b/core/config/meta/constants.go index b15eb53d0d94..9be49fec0eed 100644 --- a/core/config/meta/constants.go +++ b/core/config/meta/constants.go @@ -8,6 +8,7 @@ const ( ProviderModelsTTS = "models:tts" ProviderModelsTranscript = "models:transcript" ProviderModelsVAD = "models:vad" + ProviderModelsScore = "models:score" ) // Static option lists embedded directly in field metadata. 
diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 54d891106d8e..f429ff3dc673 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -210,6 +210,7 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Chat Template", Description: "Go template for chat completion requests", Component: "code-editor", + Language: "gotemplate", Order: 40, }, "template.chat_message": { @@ -217,6 +218,7 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Chat Message Template", Description: "Go template for individual chat messages", Component: "code-editor", + Language: "gotemplate", Order: 41, }, "template.completion": { @@ -224,13 +226,22 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Completion Template", Description: "Go template for completion requests", Component: "code-editor", + Language: "gotemplate", Order: 42, }, + "template.function": { + Section: "templates", + Label: "Functions Template", + Description: "Go template applied when tools/functions are present in the request", + Component: "code-editor", + Language: "gotemplate", + Order: 43, + }, "template.use_tokenizer_template": { Section: "templates", Label: "Use Tokenizer Template", Description: "Use the chat template from the model's tokenizer config", - Order: 43, + Order: 44, }, // Router section template — kept in the templates UI section // (rather than the router section under "other") so operators @@ -241,7 +252,8 @@ func DefaultRegistry() map[string]FieldMetaOverride { Label: "Router Classifier System Prompt", Description: "Go text/template (with sprig functions) for the routing system prompt the score classifier feeds to its classifier_model. Executed with `.Policies` ([]{Label, Description}). Empty falls back to the built-in Arch-Router-shaped prompt (route-listing block + JSON output schema). 
Override when the classifier model was trained on a different schema or you need the routing instructions in a different language. The candidate format scored against the model is fixed at `{\"route\": \"