fix(llm-validation): accept stepfun multimodal models and return 422 on capability mismatch

This commit is contained in:
2026-02-24 16:00:23 +03:00
parent 4fd9d6b6d5
commit 5f6e9c0cc0
5 changed files with 37 additions and 8 deletions

View File

@@ -1239,7 +1239,7 @@ async def _dispatch_intent(
) )
provider = LLMProviderService(db).get_provider(provider_id) provider = LLMProviderService(db).get_provider(provider_id)
provider_model = provider.default_model if provider else "" provider_model = provider.default_model if provider else ""
if not is_multimodal_model(provider_model): if not is_multimodal_model(provider_model, provider.provider_type if provider else None):
raise HTTPException( raise HTTPException(
status_code=422, status_code=422,
detail=( detail=(

View File

@@ -83,9 +83,13 @@ async def create_task(
db_provider = llm_service.get_provider(provider_id) db_provider = llm_service.get_provider(provider_id)
if not db_provider: if not db_provider:
raise ValueError(f"LLM Provider {provider_id} not found") raise ValueError(f"LLM Provider {provider_id} not found")
if request.plugin_id == "llm_dashboard_validation" and not is_multimodal_model(db_provider.default_model): if request.plugin_id == "llm_dashboard_validation" and not is_multimodal_model(
raise ValueError( db_provider.default_model,
"Selected provider model is not multimodal for dashboard validation" db_provider.provider_type,
):
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail="Selected provider model is not multimodal for dashboard validation",
) )
finally: finally:
db.close() db.close()

View File

@@ -109,7 +109,7 @@ class DashboardValidationPlugin(PluginBase):
llm_log.debug(f" Base URL: {db_provider.base_url}") llm_log.debug(f" Base URL: {db_provider.base_url}")
llm_log.debug(f" Default Model: {db_provider.default_model}") llm_log.debug(f" Default Model: {db_provider.default_model}")
llm_log.debug(f" Is Active: {db_provider.is_active}") llm_log.debug(f" Is Active: {db_provider.is_active}")
if not is_multimodal_model(db_provider.default_model): if not is_multimodal_model(db_provider.default_model, db_provider.provider_type):
raise ValueError( raise ValueError(
"Dashboard validation requires a multimodal model (image input support)." "Dashboard validation requires a multimodal model (image input support)."
) )

View File

@@ -74,6 +74,7 @@ def test_render_prompt_replaces_known_placeholders():
def test_is_multimodal_model_detects_known_vision_models():
    """Known vision-capable model names are detected; text-only names are not.

    `is_multimodal_model` is a name-based heuristic; the optional second
    argument (provider_type) enables provider-specific matching — here the
    OpenRouter StepFun family added by this commit.
    """
    assert is_multimodal_model("gpt-4o") is True
    assert is_multimodal_model("claude-3-5-sonnet") is True
    # OpenRouter ids carry a vendor prefix and variant suffix; the provider
    # hint is required for the StepFun fallback path to accept them.
    assert is_multimodal_model("stepfun/step-3.5-flash:free", "openrouter") is True
    assert is_multimodal_model("text-only-model") is False
# [/DEF:test_is_multimodal_model_detects_known_vision_models:Function]

View File

@@ -9,7 +9,7 @@
from __future__ import annotations from __future__ import annotations
from copy import deepcopy from copy import deepcopy
from typing import Dict, Any from typing import Dict, Any, Optional
# [DEF:DEFAULT_LLM_PROMPTS:Constant] # [DEF:DEFAULT_LLM_PROMPTS:Constant]
@@ -131,10 +131,21 @@ def normalize_llm_settings(llm_settings: Any) -> Dict[str, Any]:
# @PURPOSE: Heuristically determine whether model supports image input required for dashboard validation. # @PURPOSE: Heuristically determine whether model supports image input required for dashboard validation.
# @PRE: model_name may be empty or mixed-case. # @PRE: model_name may be empty or mixed-case.
# @POST: Returns True when model likely supports multimodal input. # @POST: Returns True when model likely supports multimodal input.
def is_multimodal_model(model_name: str) -> bool: def is_multimodal_model(model_name: str, provider_type: Optional[str] = None) -> bool:
token = (model_name or "").strip().lower() token = (model_name or "").strip().lower()
if not token: if not token:
return False return False
provider = (provider_type or "").strip().lower()
text_only_markers = (
"text-only",
"embedding",
"rerank",
"whisper",
"tts",
"transcribe",
)
if any(marker in token for marker in text_only_markers):
return False
multimodal_markers = ( multimodal_markers = (
"gpt-4o", "gpt-4o",
"gpt-4.1", "gpt-4.1",
@@ -143,8 +154,21 @@ def is_multimodal_model(model_name: str) -> bool:
"gemini", "gemini",
"claude-3", "claude-3",
"claude-sonnet-4", "claude-sonnet-4",
"omni",
"multimodal",
"pixtral",
"llava",
"internvl",
"qwen-vl",
"qwen2-vl",
"stepfun/step-3.5",
) )
return any(marker in token for marker in multimodal_markers) if any(marker in token for marker in multimodal_markers):
return True
# OpenRouter model ids are heterogeneous; keep permissive path for known StepFun family.
if provider == "openrouter" and token.startswith("stepfun/step-3.5"):
return True
return False
# [/DEF:is_multimodal_model:Function] # [/DEF:is_multimodal_model:Function]