Похоже работает
This commit is contained in:
@@ -143,6 +143,15 @@ async def test_connection(
|
||||
raise HTTPException(status_code=404, detail="Provider not found")
|
||||
|
||||
api_key = service.get_decrypted_api_key(provider_id)
|
||||
|
||||
# Check if API key was successfully decrypted
|
||||
if not api_key:
|
||||
logger.error(f"[llm_routes][test_connection] Failed to decrypt API key for provider {provider_id}")
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Failed to decrypt API key. The provider may have been encrypted with a different encryption key. Please update the provider with a new API key."
|
||||
)
|
||||
|
||||
client = LLMClient(
|
||||
provider_type=LLMProviderType(db_provider.provider_type),
|
||||
api_key=api_key,
|
||||
@@ -173,6 +182,13 @@ async def test_provider_config(
|
||||
from ...plugins.llm_analysis.service import LLMClient
|
||||
logger.info(f"[llm_routes][test_provider_config][Action] Testing config for {config.name}")
|
||||
|
||||
# Check if API key is provided
|
||||
if not config.api_key or config.api_key == "********":
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="API key is required for testing connection"
|
||||
)
|
||||
|
||||
client = LLMClient(
|
||||
provider_type=config.provider_type,
|
||||
api_key=config.api_key,
|
||||
|
||||
@@ -18,6 +18,7 @@ import asyncio
|
||||
import os
|
||||
|
||||
from .dependencies import get_task_manager, get_scheduler_service
|
||||
from .core.utils.network import NetworkError
|
||||
from .core.logger import logger, belief_scope
|
||||
from .api.routes import plugins, tasks, settings, environments, mappings, migration, connections, git, storage, admin, llm
|
||||
from .api import auth
|
||||
@@ -77,13 +78,34 @@ app.add_middleware(
|
||||
# @POST: Logs request and response details.
|
||||
# @PARAM: request (Request) - The incoming request object.
|
||||
# @PARAM: call_next (Callable) - The next middleware or route handler.
|
||||
@app.exception_handler(NetworkError)
|
||||
async def network_error_handler(request: Request, exc: NetworkError):
|
||||
with belief_scope("network_error_handler"):
|
||||
logger.error(f"Network error: {exc}")
|
||||
return HTTPException(
|
||||
status_code=503,
|
||||
detail="Environment unavailable. Please check if the Superset instance is running."
|
||||
)
|
||||
|
||||
@app.middleware("http")
|
||||
async def log_requests(request: Request, call_next):
|
||||
with belief_scope("log_requests", f"{request.method} {request.url.path}"):
|
||||
logger.info(f"[DEBUG] Incoming request: {request.method} {request.url.path}")
|
||||
# Avoid spamming logs for polling endpoints
|
||||
is_polling = request.url.path.endswith("/api/tasks") and request.method == "GET"
|
||||
|
||||
if not is_polling:
|
||||
logger.info(f"Incoming request: {request.method} {request.url.path}")
|
||||
|
||||
try:
|
||||
response = await call_next(request)
|
||||
logger.info(f"[DEBUG] Response status: {response.status_code} for {request.url.path}")
|
||||
if not is_polling:
|
||||
logger.info(f"Response status: {response.status_code} for {request.url.path}")
|
||||
return response
|
||||
except NetworkError as e:
|
||||
logger.error(f"Network error caught in middleware: {e}")
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="Environment unavailable. Please check if the Superset instance is running."
|
||||
)
|
||||
# [/DEF:log_requests:Function]
|
||||
|
||||
# Include API routes
|
||||
|
||||
@@ -36,6 +36,7 @@ class TaskPersistenceService:
|
||||
# @PRE: isinstance(task, Task)
|
||||
# @POST: Task record created or updated in database.
|
||||
# @PARAM: task (Task) - The task object to persist.
|
||||
# @SIDE_EFFECT: Writes to task_records table in tasks.db
|
||||
def persist_task(self, task: Task) -> None:
|
||||
with belief_scope("TaskPersistenceService.persist_task", f"task_id={task.id}"):
|
||||
session: Session = TasksSessionLocal()
|
||||
@@ -50,8 +51,19 @@ class TaskPersistenceService:
|
||||
record.environment_id = task.params.get("environment_id") or task.params.get("source_env_id")
|
||||
record.started_at = task.started_at
|
||||
record.finished_at = task.finished_at
|
||||
record.params = task.params
|
||||
record.result = task.result
|
||||
|
||||
# Ensure params and result are JSON serializable
|
||||
def json_serializable(obj):
|
||||
if isinstance(obj, dict):
|
||||
return {k: json_serializable(v) for k, v in obj.items()}
|
||||
elif isinstance(obj, list):
|
||||
return [json_serializable(v) for v in obj]
|
||||
elif isinstance(obj, datetime):
|
||||
return obj.isoformat()
|
||||
return obj
|
||||
|
||||
record.params = json_serializable(task.params)
|
||||
record.result = json_serializable(task.result)
|
||||
|
||||
# Store logs as JSON, converting datetime to string
|
||||
record.logs = []
|
||||
@@ -59,6 +71,9 @@ class TaskPersistenceService:
|
||||
log_dict = log.dict()
|
||||
if isinstance(log_dict.get('timestamp'), datetime):
|
||||
log_dict['timestamp'] = log_dict['timestamp'].isoformat()
|
||||
# Also clean up any datetimes in context
|
||||
if log_dict.get('context'):
|
||||
log_dict['context'] = json_serializable(log_dict['context'])
|
||||
record.logs.append(log_dict)
|
||||
|
||||
# Extract error if failed
|
||||
|
||||
@@ -140,7 +140,16 @@ class APIClient:
|
||||
app_logger.info("[authenticate][Enter] Authenticating to %s", self.base_url)
|
||||
try:
|
||||
login_url = f"{self.base_url}/security/login"
|
||||
# Log the payload keys and values (masking password)
|
||||
masked_auth = {k: ("******" if k == "password" else v) for k, v in self.auth.items()}
|
||||
app_logger.info(f"[authenticate][Debug] Login URL: {login_url}")
|
||||
app_logger.info(f"[authenticate][Debug] Auth payload: {masked_auth}")
|
||||
|
||||
response = self.session.post(login_url, json=self.auth, timeout=self.request_settings["timeout"])
|
||||
|
||||
if response.status_code != 200:
|
||||
app_logger.error(f"[authenticate][Error] Status: {response.status_code}, Response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
access_token = response.json()["access_token"]
|
||||
|
||||
@@ -153,6 +162,9 @@ class APIClient:
|
||||
app_logger.info("[authenticate][Exit] Authenticated successfully.")
|
||||
return self._tokens
|
||||
except requests.exceptions.HTTPError as e:
|
||||
status_code = e.response.status_code if e.response is not None else None
|
||||
if status_code in [502, 503, 504]:
|
||||
raise NetworkError(f"Environment unavailable during authentication (Status {status_code})", status_code=status_code) from e
|
||||
raise AuthenticationError(f"Authentication failed: {e}") from e
|
||||
except (requests.exceptions.RequestException, KeyError) as e:
|
||||
raise NetworkError(f"Network or parsing error during authentication: {e}") from e
|
||||
@@ -209,6 +221,8 @@ class APIClient:
|
||||
def _handle_http_error(self, e: requests.exceptions.HTTPError, endpoint: str):
|
||||
with belief_scope("_handle_http_error"):
|
||||
status_code = e.response.status_code
|
||||
if status_code == 502 or status_code == 503 or status_code == 504:
|
||||
raise NetworkError(f"Environment unavailable (Status {status_code})", status_code=status_code) from e
|
||||
if status_code == 404: raise DashboardNotFoundError(endpoint) from e
|
||||
if status_code == 403: raise PermissionDeniedError() from e
|
||||
if status_code == 401: raise AuthenticationError() from e
|
||||
|
||||
@@ -35,8 +35,7 @@ init_db()
|
||||
# @RETURN: ConfigManager - The shared config manager instance.
|
||||
def get_config_manager() -> ConfigManager:
|
||||
"""Dependency injector for the ConfigManager."""
|
||||
with belief_scope("get_config_manager"):
|
||||
return config_manager
|
||||
return config_manager
|
||||
# [/DEF:get_config_manager:Function]
|
||||
|
||||
plugin_dir = Path(__file__).parent / "plugins"
|
||||
@@ -58,8 +57,7 @@ logger.info("SchedulerService initialized")
|
||||
# @RETURN: PluginLoader - The shared plugin loader instance.
|
||||
def get_plugin_loader() -> PluginLoader:
|
||||
"""Dependency injector for the PluginLoader."""
|
||||
with belief_scope("get_plugin_loader"):
|
||||
return plugin_loader
|
||||
return plugin_loader
|
||||
# [/DEF:get_plugin_loader:Function]
|
||||
|
||||
# [DEF:get_task_manager:Function]
|
||||
@@ -69,8 +67,7 @@ def get_plugin_loader() -> PluginLoader:
|
||||
# @RETURN: TaskManager - The shared task manager instance.
|
||||
def get_task_manager() -> TaskManager:
|
||||
"""Dependency injector for the TaskManager."""
|
||||
with belief_scope("get_task_manager"):
|
||||
return task_manager
|
||||
return task_manager
|
||||
# [/DEF:get_task_manager:Function]
|
||||
|
||||
# [DEF:get_scheduler_service:Function]
|
||||
@@ -80,8 +77,7 @@ def get_task_manager() -> TaskManager:
|
||||
# @RETURN: SchedulerService - The shared scheduler service instance.
|
||||
def get_scheduler_service() -> SchedulerService:
|
||||
"""Dependency injector for the SchedulerService."""
|
||||
with belief_scope("get_scheduler_service"):
|
||||
return scheduler_service
|
||||
return scheduler_service
|
||||
# [/DEF:get_scheduler_service:Function]
|
||||
|
||||
# [DEF:oauth2_scheme:Variable]
|
||||
@@ -98,25 +94,24 @@ oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/auth/login")
|
||||
# @PARAM: db (Session) - Auth database session.
|
||||
# @RETURN: User - The authenticated user.
|
||||
def get_current_user(token: str = Depends(oauth2_scheme), db = Depends(get_auth_db)):
|
||||
with belief_scope("get_current_user"):
|
||||
credentials_exception = HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Could not validate credentials",
|
||||
headers={"WWW-Authenticate": "Bearer"},
|
||||
)
|
||||
try:
|
||||
payload = decode_token(token)
|
||||
username: str = payload.get("sub")
|
||||
if username is None:
|
||||
raise credentials_exception
|
||||
except JWTError:
|
||||
credentials_exception = HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Could not validate credentials",
|
||||
headers={"WWW-Authenticate": "Bearer"},
|
||||
)
|
||||
try:
|
||||
payload = decode_token(token)
|
||||
username: str = payload.get("sub")
|
||||
if username is None:
|
||||
raise credentials_exception
|
||||
|
||||
repo = AuthRepository(db)
|
||||
user = repo.get_user_by_username(username)
|
||||
if user is None:
|
||||
raise credentials_exception
|
||||
return user
|
||||
except JWTError:
|
||||
raise credentials_exception
|
||||
|
||||
repo = AuthRepository(db)
|
||||
user = repo.get_user_by_username(username)
|
||||
if user is None:
|
||||
raise credentials_exception
|
||||
return user
|
||||
# [/DEF:get_current_user:Function]
|
||||
|
||||
# [DEF:has_permission:Function]
|
||||
@@ -129,24 +124,23 @@ def get_current_user(token: str = Depends(oauth2_scheme), db = Depends(get_auth_
|
||||
# @RETURN: User - The authenticated user if permission granted.
|
||||
def has_permission(resource: str, action: str):
|
||||
def permission_checker(current_user: User = Depends(get_current_user)):
|
||||
with belief_scope("has_permission", f"{resource}:{action}"):
|
||||
# Union of all permissions across all roles
|
||||
for role in current_user.roles:
|
||||
for perm in role.permissions:
|
||||
if perm.resource == resource and perm.action == action:
|
||||
return current_user
|
||||
# Union of all permissions across all roles
|
||||
for role in current_user.roles:
|
||||
for perm in role.permissions:
|
||||
if perm.resource == resource and perm.action == action:
|
||||
return current_user
|
||||
|
||||
# Special case for Admin role (full access)
|
||||
if any(role.name == "Admin" for role in current_user.roles):
|
||||
return current_user
|
||||
|
||||
from .core.auth.logger import log_security_event
|
||||
log_security_event("PERMISSION_DENIED", current_user.username, {"resource": resource, "action": action})
|
||||
|
||||
# Special case for Admin role (full access)
|
||||
if any(role.name == "Admin" for role in current_user.roles):
|
||||
return current_user
|
||||
|
||||
from .core.auth.logger import log_security_event
|
||||
log_security_event("PERMISSION_DENIED", current_user.username, {"resource": resource, "action": action})
|
||||
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_403_FORBIDDEN,
|
||||
detail=f"Permission denied for {resource}:{action}"
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_403_FORBIDDEN,
|
||||
detail=f"Permission denied for {resource}:{action}"
|
||||
)
|
||||
return permission_checker
|
||||
# [/DEF:has_permission:Function]
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# [DEF:backend/src/plugins/llm_analysis/__init__.py:Module]
|
||||
# @TIER: TRIVIAL
|
||||
# @PURPOSE: Initialize the LLM Analysis plugin package.
|
||||
# @LAYER: Domain
|
||||
|
||||
"""
|
||||
LLM Analysis Plugin for automated dashboard validation and dataset documentation.
|
||||
@@ -8,4 +9,4 @@ LLM Analysis Plugin for automated dashboard validation and dataset documentation
|
||||
|
||||
from .plugin import DashboardValidationPlugin, DocumentationPlugin
|
||||
|
||||
# [/DEF:backend/src/plugins/llm_analysis/__init__.py]
|
||||
# [/DEF:backend/src/plugins/llm_analysis/__init__.py:Module]
|
||||
|
||||
@@ -24,7 +24,7 @@ class LLMProviderConfig(BaseModel):
|
||||
provider_type: LLMProviderType
|
||||
name: str
|
||||
base_url: str
|
||||
api_key: str
|
||||
api_key: Optional[str] = None
|
||||
default_model: str
|
||||
is_active: bool = True
|
||||
# [/DEF:LLMProviderConfig:Class]
|
||||
@@ -58,4 +58,4 @@ class ValidationResult(BaseModel):
|
||||
raw_response: Optional[str] = None
|
||||
# [/DEF:ValidationResult:Class]
|
||||
|
||||
# [/DEF:backend/src/plugins/llm_analysis/models.py]
|
||||
# [/DEF:backend/src/plugins/llm_analysis/models.py:Module]
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# [DEF:backend.src.plugins.llm_analysis.plugin:Module]
|
||||
# [DEF:backend/src/plugins/llm_analysis/plugin.py:Module]
|
||||
# @TIER: STANDARD
|
||||
# @SEMANTICS: plugin, llm, analysis, documentation
|
||||
# @PURPOSE: Implements DashboardValidationPlugin and DocumentationPlugin.
|
||||
# @LAYER: Domain
|
||||
# @RELATION: INHERITS_FROM -> backend.src.core.plugin_base.PluginBase
|
||||
# @RELATION: INHERITS -> backend.src.core.plugin_base.PluginBase
|
||||
# @RELATION: CALLS -> backend.src.plugins.llm_analysis.service.ScreenshotService
|
||||
# @RELATION: CALLS -> backend.src.plugins.llm_analysis.service.LLMClient
|
||||
# @RELATION: CALLS -> backend.src.services.llm_provider.LLMProviderService
|
||||
@@ -12,6 +12,7 @@
|
||||
from typing import Dict, Any, Optional, List
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from datetime import datetime, timedelta
|
||||
from ...core.plugin_base import PluginBase
|
||||
from ...core.logger import belief_scope, logger
|
||||
@@ -54,6 +55,11 @@ class DashboardValidationPlugin(PluginBase):
|
||||
"required": ["dashboard_id", "environment_id", "provider_id"]
|
||||
}
|
||||
|
||||
# [DEF:DashboardValidationPlugin.execute:Function]
|
||||
# @PURPOSE: Executes the dashboard validation task.
|
||||
# @PRE: params contains dashboard_id, environment_id, and provider_id.
|
||||
# @POST: Returns a dictionary with validation results and persists them to the database.
|
||||
# @SIDE_EFFECT: Captures a screenshot, calls LLM API, and writes to the database.
|
||||
async def execute(self, params: Dict[str, Any]):
|
||||
with belief_scope("execute", f"plugin_id={self.id}"):
|
||||
logger.info(f"Executing {self.name} with params: {params}")
|
||||
@@ -88,12 +94,35 @@ class DashboardValidationPlugin(PluginBase):
|
||||
if not db_provider:
|
||||
raise ValueError(f"LLM Provider {provider_id} not found")
|
||||
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Retrieved provider config:")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Provider ID: {db_provider.id}")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Provider Name: {db_provider.name}")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Provider Type: {db_provider.provider_type}")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Base URL: {db_provider.base_url}")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Default Model: {db_provider.default_model}")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] Is Active: {db_provider.is_active}")
|
||||
|
||||
api_key = llm_service.get_decrypted_api_key(provider_id)
|
||||
logger.info(f"[DashboardValidationPlugin.execute] API Key decrypted (first 8 chars): {api_key[:8] if api_key and len(api_key) > 8 else 'EMPTY_OR_NONE'}...")
|
||||
logger.info(f"[DashboardValidationPlugin.execute] API Key Length: {len(api_key) if api_key else 0}")
|
||||
|
||||
# Check if API key was successfully decrypted
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
f"Failed to decrypt API key for provider {provider_id}. "
|
||||
f"The provider may have been encrypted with a different encryption key. "
|
||||
f"Please update the provider with a new API key through the UI."
|
||||
)
|
||||
|
||||
# 3. Capture Screenshot
|
||||
screenshot_service = ScreenshotService(env)
|
||||
os.makedirs("ss-tools-storage/screenshots", exist_ok=True)
|
||||
screenshot_path = f"ss-tools-storage/screenshots/{dashboard_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
|
||||
|
||||
storage_root = config_mgr.get_config().settings.storage.root_path
|
||||
screenshots_dir = os.path.join(storage_root, "screenshots")
|
||||
os.makedirs(screenshots_dir, exist_ok=True)
|
||||
|
||||
filename = f"{dashboard_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
|
||||
screenshot_path = os.path.join(screenshots_dir, filename)
|
||||
|
||||
await screenshot_service.capture_dashboard(dashboard_id, screenshot_path)
|
||||
|
||||
@@ -109,8 +138,8 @@ class DashboardValidationPlugin(PluginBase):
|
||||
# Note: We filter by dashboard_id matching the object
|
||||
query_params = {
|
||||
"filters": [
|
||||
{"col": "dashboard_id", "op": "eq", "value": dashboard_id},
|
||||
{"col": "dttm", "op": "gt", "value": start_time}
|
||||
{"col": "dashboard_id", "opr": "eq", "value": dashboard_id},
|
||||
{"col": "dttm", "opr": "gt", "value": start_time}
|
||||
],
|
||||
"order_column": "dttm",
|
||||
"order_direction": "desc",
|
||||
@@ -149,11 +178,11 @@ class DashboardValidationPlugin(PluginBase):
|
||||
analysis = await llm_client.analyze_dashboard(screenshot_path, logs)
|
||||
|
||||
# Log analysis summary to task logs for better visibility
|
||||
logger.info(f"[ANALYSIS_SUMMARY] Status: {analysis['status']}")
|
||||
logger.info(f"[ANALYSIS_SUMMARY] Summary: {analysis['summary']}")
|
||||
task_log("INFO", f"[ANALYSIS_SUMMARY] Status: {analysis['status']}")
|
||||
task_log("INFO", f"[ANALYSIS_SUMMARY] Summary: {analysis['summary']}")
|
||||
if analysis.get("issues"):
|
||||
for i, issue in enumerate(analysis["issues"]):
|
||||
logger.info(f"[ANALYSIS_ISSUE][{i+1}] {issue.get('severity')}: {issue.get('message')} (Location: {issue.get('location', 'N/A')})")
|
||||
task_log("INFO", f"[ANALYSIS_ISSUE][{i+1}] {issue.get('severity')}: {issue.get('message')} (Location: {issue.get('location', 'N/A')})")
|
||||
|
||||
# 6. Persist Result
|
||||
validation_result = ValidationResult(
|
||||
@@ -178,7 +207,7 @@ class DashboardValidationPlugin(PluginBase):
|
||||
|
||||
# 7. Notification on failure (US1 / FR-015)
|
||||
if validation_result.status == ValidationStatus.FAIL:
|
||||
logger.warning(f"Dashboard {dashboard_id} validation FAILED. Summary: {validation_result.summary}")
|
||||
task_log("WARNING", f"Dashboard {dashboard_id} validation FAILED. Summary: {validation_result.summary}")
|
||||
# Placeholder for Email/Pulse notification dispatch
|
||||
# In a real implementation, we would call a NotificationService here
|
||||
# with a payload containing the summary and a link to the report.
|
||||
@@ -190,6 +219,7 @@ class DashboardValidationPlugin(PluginBase):
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
# [/DEF:DashboardValidationPlugin.execute:Function]
|
||||
# [/DEF:DashboardValidationPlugin:Class]
|
||||
|
||||
# [DEF:DocumentationPlugin:Class]
|
||||
@@ -223,12 +253,7 @@ class DocumentationPlugin(PluginBase):
|
||||
"required": ["dataset_id", "environment_id", "provider_id"]
|
||||
}
|
||||
|
||||
# [DEF:execute:Function]
|
||||
# @PURPOSE: Executes the dashboard validation task.
|
||||
# @PRE: params contains dashboard_id, environment_id, and provider_id.
|
||||
# @POST: Returns a dictionary with validation results and persists them to the database.
|
||||
# @SIDE_EFFECT: Captures a screenshot, calls LLM API, and writes to the database.
|
||||
# [DEF:execute:Function]
|
||||
# [DEF:DocumentationPlugin.execute:Function]
|
||||
# @PURPOSE: Executes the dataset documentation task.
|
||||
# @PRE: params contains dataset_id, environment_id, and provider_id.
|
||||
# @POST: Returns generated documentation and updates the dataset in Superset.
|
||||
@@ -256,7 +281,25 @@ class DocumentationPlugin(PluginBase):
|
||||
if not db_provider:
|
||||
raise ValueError(f"LLM Provider {provider_id} not found")
|
||||
|
||||
logger.info(f"[DocumentationPlugin.execute] Retrieved provider config:")
|
||||
logger.info(f"[DocumentationPlugin.execute] Provider ID: {db_provider.id}")
|
||||
logger.info(f"[DocumentationPlugin.execute] Provider Name: {db_provider.name}")
|
||||
logger.info(f"[DocumentationPlugin.execute] Provider Type: {db_provider.provider_type}")
|
||||
logger.info(f"[DocumentationPlugin.execute] Base URL: {db_provider.base_url}")
|
||||
logger.info(f"[DocumentationPlugin.execute] Default Model: {db_provider.default_model}")
|
||||
logger.info(f"[DocumentationPlugin.execute] Is Active: {db_provider.is_active}")
|
||||
|
||||
api_key = llm_service.get_decrypted_api_key(provider_id)
|
||||
logger.info(f"[DocumentationPlugin.execute] API Key decrypted (first 8 chars): {api_key[:8] if api_key and len(api_key) > 8 else 'EMPTY_OR_NONE'}...")
|
||||
logger.info(f"[DocumentationPlugin.execute] API Key Length: {len(api_key) if api_key else 0}")
|
||||
|
||||
# Check if API key was successfully decrypted
|
||||
if not api_key:
|
||||
raise ValueError(
|
||||
f"Failed to decrypt API key for provider {provider_id}. "
|
||||
f"The provider may have been encrypted with a different encryption key. "
|
||||
f"Please update the provider with a new API key through the UI."
|
||||
)
|
||||
|
||||
# 3. Fetch Metadata (US2 / T024)
|
||||
from ...core.superset_client import SupersetClient
|
||||
@@ -328,6 +371,7 @@ class DocumentationPlugin(PluginBase):
|
||||
|
||||
finally:
|
||||
db.close()
|
||||
# [/DEF:DocumentationPlugin.execute:Function]
|
||||
# [/DEF:DocumentationPlugin:Class]
|
||||
|
||||
# [/DEF:backend.src.plugins.llm_analysis.plugin:Module]
|
||||
# [/DEF:backend/src/plugins/llm_analysis/plugin.py:Module]
|
||||
|
||||
@@ -14,6 +14,7 @@ from ...core.logger import belief_scope, logger
|
||||
# @PARAM: dashboard_id (str) - ID of the dashboard to validate.
|
||||
# @PARAM: cron_expression (str) - Standard cron expression for scheduling.
|
||||
# @PARAM: params (Dict[str, Any]) - Task parameters (environment_id, provider_id).
|
||||
# @SIDE_EFFECT: Adds a job to the scheduler service.
|
||||
def schedule_dashboard_validation(dashboard_id: str, cron_expression: str, params: Dict[str, Any]):
|
||||
with belief_scope("schedule_dashboard_validation", f"dashboard_id={dashboard_id}"):
|
||||
scheduler = get_scheduler_service()
|
||||
@@ -39,6 +40,10 @@ def schedule_dashboard_validation(dashboard_id: str, cron_expression: str, param
|
||||
)
|
||||
logger.info(f"Scheduled validation for dashboard {dashboard_id} with cron {cron_expression}")
|
||||
|
||||
# [DEF:_parse_cron:Function]
|
||||
# @PURPOSE: Basic cron parser placeholder.
|
||||
# @PARAM: cron (str) - Cron expression.
|
||||
# @RETURN: Dict[str, str] - Parsed cron parts.
|
||||
def _parse_cron(cron: str) -> Dict[str, str]:
|
||||
# Basic cron parser placeholder
|
||||
parts = cron.split()
|
||||
@@ -51,6 +56,5 @@ def _parse_cron(cron: str) -> Dict[str, str]:
|
||||
"month": parts[3],
|
||||
"day_of_week": parts[4]
|
||||
}
|
||||
# [/DEF:schedule_dashboard_validation:Function]
|
||||
|
||||
# [/DEF:backend/src/plugins/llm_analysis/scheduler.py]
|
||||
# [/DEF:backend/src/plugins/llm_analysis/scheduler.py:Module]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# [DEF:backend.src.plugins.llm_analysis.service:Module]
|
||||
# [DEF:backend/src/plugins/llm_analysis/service.py:Module]
|
||||
# @TIER: STANDARD
|
||||
# @SEMANTICS: service, llm, screenshot, playwright, openai
|
||||
# @PURPOSE: Services for LLM interaction and dashboard screenshots.
|
||||
@@ -6,12 +6,17 @@
|
||||
# @RELATION: DEPENDS_ON -> playwright
|
||||
# @RELATION: DEPENDS_ON -> openai
|
||||
# @RELATION: DEPENDS_ON -> tenacity
|
||||
# @INVARIANT: Screenshots must be 1920px width and capture full page height.
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import io
|
||||
from typing import List, Optional, Dict, Any
|
||||
from PIL import Image
|
||||
from playwright.async_api import async_playwright
|
||||
from openai import AsyncOpenAI, RateLimitError
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
|
||||
from openai import AsyncOpenAI, RateLimitError, AuthenticationError as OpenAIAuthenticationError
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception
|
||||
from .models import LLMProviderType, ValidationResult, ValidationStatus, DetectedIssue
|
||||
from ...core.logger import belief_scope, logger
|
||||
from ...core.config_models import Environment
|
||||
@@ -19,142 +24,559 @@ from ...core.config_models import Environment
|
||||
# [DEF:ScreenshotService:Class]
|
||||
# @PURPOSE: Handles capturing screenshots of Superset dashboards.
|
||||
class ScreenshotService:
|
||||
# [DEF:ScreenshotService.__init__:Function]
|
||||
# @PURPOSE: Initializes the ScreenshotService with environment configuration.
|
||||
# @PRE: env is a valid Environment object.
|
||||
def __init__(self, env: Environment):
|
||||
self.env = env
|
||||
# [/DEF:ScreenshotService.__init__:Function]
|
||||
|
||||
# [DEF:capture_dashboard:Function]
|
||||
# @PURPOSE: Captures a screenshot of a dashboard using Playwright.
|
||||
# @PARAM: dashboard_id (str) - ID of the dashboard.
|
||||
# @PARAM: output_path (str) - Path to save the screenshot.
|
||||
# @RETURN: bool - True if successful.
|
||||
# [DEF:ScreenshotService.capture_dashboard:Function]
|
||||
# @PURPOSE: Captures a full-page screenshot of a dashboard using Playwright and CDP.
|
||||
# @PRE: dashboard_id is a valid string, output_path is a writable path.
|
||||
# @POST: Returns True if screenshot is saved successfully.
|
||||
# @SIDE_EFFECT: Launches a browser, performs UI login, switches tabs, and writes a PNG file.
|
||||
# @UX_STATE: [Navigating] -> Loading dashboard UI
|
||||
# @UX_STATE: [TabSwitching] -> Iterating through dashboard tabs to trigger lazy loading
|
||||
# @UX_STATE: [CalculatingHeight] -> Determining dashboard dimensions
|
||||
# @UX_STATE: [Capturing] -> Executing CDP screenshot
|
||||
async def capture_dashboard(self, dashboard_id: str, output_path: str) -> bool:
    """Capture a full-page screenshot of a dashboard using Playwright and CDP.

    Logs in through the Superset UI with the environment's credentials,
    walks all dashboard tabs to trigger lazy chart loading, resizes the
    viewport to the full dashboard height and captures a PNG through the
    Chrome DevTools Protocol (bypassing Playwright's font-loading wait).

    :param dashboard_id: ID of the dashboard to capture.
    :param output_path: Writable path where the PNG screenshot is saved.
    :return: True if the screenshot was saved; exceptions propagate on failure.

    NOTE(review): this body removes a stale, duplicated login/launch path
    left behind by a bad merge (two browser launches, two login flows).
    """
    with belief_scope("capture_dashboard", f"dashboard_id={dashboard_id}"):
        logger.info(f"Capturing screenshot for dashboard {dashboard_id}")
        async with async_playwright() as p:
            browser = await p.chromium.launch(
                headless=True,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-infobars",
                    "--no-sandbox"
                ]
            )
            # Realistic user agent to avoid 403 Forbidden from OpenResty/WAF.
            user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

            # Construct base UI URL from environment (strip /api/v1 suffix).
            base_ui_url = self.env.url.rstrip("/")
            if base_ui_url.endswith("/api/v1"):
                base_ui_url = base_ui_url[:-len("/api/v1")]

            # Create browser context with realistic headers.
            context = await browser.new_context(
                viewport={'width': 1280, 'height': 720},
                user_agent=user_agent,
                extra_http_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                    "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
                    "Upgrade-Insecure-Requests": "1",
                    "Sec-Fetch-Dest": "document",
                    "Sec-Fetch-Mode": "navigate",
                    "Sec-Fetch-Site": "none",
                    "Sec-Fetch-User": "?1"
                }
            )
            logger.info("Browser context created successfully")

            page = await context.new_page()
            # Bypass navigator.webdriver automation detection.
            await page.add_init_script("delete Object.getPrototypeOf(navigator).webdriver")

            # 1. Navigate to login page and authenticate through the UI form.
            login_url = f"{base_ui_url.rstrip('/')}/login/"
            logger.info(f"[DEBUG] Navigating to login page: {login_url}")

            response = await page.goto(login_url, wait_until="networkidle", timeout=60000)
            if response:
                logger.info(f"[DEBUG] Login page response status: {response.status}")

            # Wait for login form to be ready.
            await page.wait_for_load_state("domcontentloaded")

            # Exhaustive selector candidates for various Superset versions/themes.
            selectors = {
                "username": ['input[name="username"]', 'input#username', 'input[placeholder*="Username"]', 'input[type="text"]'],
                "password": ['input[name="password"]', 'input#password', 'input[placeholder*="Password"]', 'input[type="password"]'],
                "submit": ['button[type="submit"]', 'button#submit', '.btn-primary', 'input[type="submit"]']
            }
            logger.info(f"[DEBUG] Attempting to find login form elements...")

            try:
                # Find and fill username.
                u_selector = None
                for s in selectors["username"]:
                    count = await page.locator(s).count()
                    logger.info(f"[DEBUG] Selector '{s}': {count} elements found")
                    if count > 0:
                        u_selector = s
                        break

                if not u_selector:
                    # Log the input fields present on the page for debugging.
                    all_inputs = await page.locator('input').all()
                    logger.info(f"[DEBUG] Found {len(all_inputs)} input fields on page")
                    for i, inp in enumerate(all_inputs[:5]):  # Log first 5
                        inp_type = await inp.get_attribute('type')
                        inp_name = await inp.get_attribute('name')
                        inp_id = await inp.get_attribute('id')
                        logger.info(f"[DEBUG] Input {i}: type={inp_type}, name={inp_name}, id={inp_id}")
                    raise RuntimeError("Could not find username input field on login page")

                logger.info(f"[DEBUG] Filling username field with selector: {u_selector}")
                await page.fill(u_selector, self.env.username)

                # Find and fill password.
                p_selector = None
                for s in selectors["password"]:
                    if await page.locator(s).count() > 0:
                        p_selector = s
                        break

                if not p_selector:
                    raise RuntimeError("Could not find password input field on login page")

                logger.info(f"[DEBUG] Filling password field with selector: {p_selector}")
                await page.fill(p_selector, self.env.password)

                # Click submit (fall back to the first candidate selector).
                s_selector = selectors["submit"][0]
                for s in selectors["submit"]:
                    if await page.locator(s).count() > 0:
                        s_selector = s
                        break

                logger.info(f"[DEBUG] Clicking submit button with selector: {s_selector}")
                await page.click(s_selector)

                # Wait for navigation after login.
                await page.wait_for_load_state("networkidle", timeout=30000)

                # Check if login was successful.
                if "/login" in page.url:
                    # Surface any error message shown on the login page.
                    error_msg = await page.locator(".alert-danger, .error-message").text_content() if await page.locator(".alert-danger, .error-message").count() > 0 else "Unknown error"
                    logger.error(f"[DEBUG] Login failed. Still on login page. Error: {error_msg}")
                    debug_path = output_path.replace(".png", "_debug_failed_login.png")
                    await page.screenshot(path=debug_path)
                    raise RuntimeError(f"Login failed: {error_msg}. Debug screenshot saved to {debug_path}")

                logger.info(f"[DEBUG] Login successful. Current URL: {page.url}")

                # Check cookies after successful login.
                page_cookies = await context.cookies()
                logger.info(f"[DEBUG] Cookies after login: {len(page_cookies)}")
                for c in page_cookies:
                    logger.info(f"[DEBUG] Cookie: name={c['name']}, domain={c['domain']}, value={c.get('value', '')[:20]}...")

            except Exception as e:
                page_title = await page.title()
                logger.error(f"UI Login failed. Page title: {page_title}, URL: {page.url}, Error: {str(e)}")
                debug_path = output_path.replace(".png", "_debug_failed_login.png")
                await page.screenshot(path=debug_path)
                raise RuntimeError(f"Login failed: {str(e)}. Debug screenshot saved to {debug_path}")

            # 2. Navigate to the dashboard in standalone mode.
            # @UX_STATE: [Navigating] -> Loading dashboard UI
            dashboard_url = f"{base_ui_url.rstrip('/')}/superset/dashboard/{dashboard_id}/?standalone=true"

            if base_ui_url.startswith("https://") and dashboard_url.startswith("http://"):
                dashboard_url = dashboard_url.replace("http://", "https://")

            logger.info(f"[DEBUG] Navigating to dashboard: {dashboard_url}")

            # Use networkidle to ensure all initial assets are loaded.
            response = await page.goto(dashboard_url, wait_until="networkidle", timeout=60000)
            if response:
                logger.info(f"[DEBUG] Dashboard navigation response status: {response.status}, URL: {response.url}")

            try:
                # Wait for the dashboard grid to be present.
                await page.wait_for_selector('.dashboard-component, .dashboard-header, [data-test="dashboard-grid"]', timeout=30000)
                logger.info(f"[DEBUG] Dashboard container loaded")

                # Wait for loading indicators to disappear (spinners/skeletons).
                try:
                    await page.wait_for_selector('.loading, .ant-skeleton, .spinner', state="hidden", timeout=60000)
                    logger.info(f"[DEBUG] Loading indicators hidden")
                except Exception:
                    logger.warning(f"[DEBUG] Timeout waiting for loading indicators to hide")

                # Wait for charts to actually render content (ECharts/NVD3 etc.).
                try:
                    await page.wait_for_selector('.chart-container canvas, .slice_container svg, .superset-chart-canvas, .grid-content .chart-container', timeout=60000)
                    logger.info(f"[DEBUG] Chart content detected")
                except Exception:
                    logger.warning(f"[DEBUG] Timeout waiting for chart content")

                # Wait until every chart container has non-empty content.
                logger.info(f"[DEBUG] Waiting for all charts to have rendered content...")
                await page.wait_for_function("""() => {
                    const charts = document.querySelectorAll('.chart-container, .slice_container');
                    if (charts.length === 0) return true; // No charts to wait for

                    // Check if all charts have rendered content (canvas, svg, or non-empty div)
                    return Array.from(charts).every(chart => {
                        const hasCanvas = chart.querySelector('canvas') !== null;
                        const hasSvg = chart.querySelector('svg') !== null;
                        const hasContent = chart.innerText.trim().length > 0 || chart.children.length > 0;
                        return hasCanvas || hasSvg || hasContent;
                    });
                }""", timeout=60000)
                logger.info(f"[DEBUG] All charts have rendered content")

                # Scroll through the page to trigger lazy loading of all charts.
                logger.info(f"[DEBUG] Scrolling to trigger lazy loading...")
                await page.evaluate("""async () => {
                    const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                    for (let i = 0; i < document.body.scrollHeight; i += 500) {
                        window.scrollTo(0, i);
                        await delay(100);
                    }
                    window.scrollTo(0, 0);
                    await delay(500);
                }""")

            except Exception as e:
                logger.warning(f"[DEBUG] Dashboard content wait failed: {e}, proceeding anyway after delay")

            # Final stabilization delay - increased for complex dashboards.
            logger.info(f"[DEBUG] Final stabilization delay...")
            await asyncio.sleep(15)

            # Logic to handle tabs and full-page capture.
            try:
                # 1. Handle tabs (recursive switching).
                # @UX_STATE: [TabSwitching] -> Iterating through dashboard tabs to trigger lazy loading
                processed_tabs = set()

                async def switch_tabs(depth=0):
                    # Recursively click through (nested) tab sets so lazily
                    # rendered charts get a chance to mount.
                    if depth > 3:
                        return  # Limit recursion depth

                    tab_selectors = [
                        '.ant-tabs-nav-list .ant-tabs-tab',
                        '.dashboard-component-tabs .ant-tabs-tab',
                        '[data-test="dashboard-component-tabs"] .ant-tabs-tab'
                    ]

                    found_tabs = []
                    for selector in tab_selectors:
                        found_tabs = await page.locator(selector).all()
                        if found_tabs:
                            break

                    if found_tabs:
                        logger.info(f"[DEBUG][TabSwitching] Found {len(found_tabs)} tabs at depth {depth}")
                        for i, tab in enumerate(found_tabs):
                            try:
                                tab_text = (await tab.inner_text()).strip()
                                tab_id = f"{depth}_{i}_{tab_text}"

                                if tab_id in processed_tabs:
                                    continue

                                if await tab.is_visible():
                                    logger.info(f"[DEBUG][TabSwitching] Switching to tab: {tab_text}")
                                    processed_tabs.add(tab_id)

                                    is_active = "ant-tabs-tab-active" in (await tab.get_attribute("class") or "")
                                    if not is_active:
                                        await tab.click()
                                        await asyncio.sleep(2)  # Wait for content to render

                                    await switch_tabs(depth + 1)
                            except Exception as tab_e:
                                logger.warning(f"[DEBUG][TabSwitching] Failed to process tab {i}: {tab_e}")

                        # Return to the first tab so the capture shows the default view.
                        try:
                            first_tab = found_tabs[0]
                            if "ant-tabs-tab-active" not in (await first_tab.get_attribute("class") or ""):
                                await first_tab.click()
                                await asyncio.sleep(1)
                        except Exception:
                            pass

                await switch_tabs()

                # 2. Calculate full height for screenshot.
                # @UX_STATE: [CalculatingHeight] -> Determining dashboard dimensions
                full_height = await page.evaluate("""() => {
                    const body = document.body;
                    const html = document.documentElement;
                    const dashboardContent = document.querySelector('.dashboard-content');

                    return Math.max(
                        body.scrollHeight, body.offsetHeight,
                        html.clientHeight, html.scrollHeight, html.offsetHeight,
                        dashboardContent ? dashboardContent.scrollHeight + 100 : 0
                    );
                }""")
                logger.info(f"[DEBUG] Calculated full height: {full_height}")

                # DIAGNOSTIC: Count chart elements before resize.
                chart_count_before = await page.evaluate("""() => {
                    return {
                        chartContainers: document.querySelectorAll('.chart-container, .slice_container').length,
                        canvasElements: document.querySelectorAll('canvas').length,
                        svgElements: document.querySelectorAll('.chart-container svg, .slice_container svg').length,
                        visibleCharts: document.querySelectorAll('.chart-container:visible, .slice_container:visible').length
                    };
                }""")
                logger.info(f"[DIAGNOSTIC] Chart elements BEFORE viewport resize: {chart_count_before}")

                # DIAGNOSTIC: Capture pre-resize screenshot for comparison.
                pre_resize_path = output_path.replace(".png", "_preresize.png")
                try:
                    await page.screenshot(path=pre_resize_path, full_page=False, timeout=10000)
                    import os
                    pre_resize_size = os.path.getsize(pre_resize_path) if os.path.exists(pre_resize_path) else 0
                    logger.info(f"[DIAGNOSTIC] Pre-resize screenshot saved: {pre_resize_path} ({pre_resize_size} bytes)")
                except Exception as pre_e:
                    logger.warning(f"[DIAGNOSTIC] Failed to capture pre-resize screenshot: {pre_e}")

                logger.info(f"[DIAGNOSTIC] Resizing viewport from current to 1920x{int(full_height)}")
                await page.set_viewport_size({"width": 1920, "height": int(full_height)})

                # DIAGNOSTIC: Increased wait time after resize for re-render.
                logger.info("[DIAGNOSTIC] Waiting 10 seconds after viewport resize for re-render...")
                await asyncio.sleep(10)
                logger.info("[DIAGNOSTIC] Wait completed")

                # DIAGNOSTIC: Count chart elements after resize and wait.
                chart_count_after = await page.evaluate("""() => {
                    return {
                        chartContainers: document.querySelectorAll('.chart-container, .slice_container').length,
                        canvasElements: document.querySelectorAll('canvas').length,
                        svgElements: document.querySelectorAll('.chart-container svg, .slice_container svg').length,
                        visibleCharts: document.querySelectorAll('.chart-container:visible, .slice_container:visible').length
                    };
                }""")
                logger.info(f"[DIAGNOSTIC] Chart elements AFTER viewport resize + wait: {chart_count_after}")

                # DIAGNOSTIC: Check if any charts show error states.
                chart_errors = await page.evaluate("""() => {
                    const errors = [];
                    document.querySelectorAll('.chart-container, .slice_container').forEach((chart, i) => {
                        const errorEl = chart.querySelector('.error, .alert-danger, .ant-alert-error');
                        if (errorEl) {
                            errors.push({index: i, text: errorEl.innerText.substring(0, 100)});
                        }
                    });
                    return errors;
                }""")
                if chart_errors:
                    logger.warning(f"[DIAGNOSTIC] Charts with error states detected: {chart_errors}")
                else:
                    logger.info("[DIAGNOSTIC] No chart error states detected")

                # 3. Take screenshot using CDP to bypass Playwright's font loading wait.
                # @UX_STATE: [Capturing] -> Executing CDP screenshot
                logger.info("[DEBUG] Attempting full-page screenshot via CDP...")
                cdp = await page.context.new_cdp_session(page)

                screenshot_data = await cdp.send("Page.captureScreenshot", {
                    "format": "png",
                    "fromSurface": True,
                    "captureBeyondViewport": True
                })

                image_data = base64.b64decode(screenshot_data["data"])

                with open(output_path, 'wb') as f:
                    f.write(image_data)

                # DIAGNOSTIC: Verify screenshot file.
                import os
                final_size = os.path.getsize(output_path) if os.path.exists(output_path) else 0
                logger.info(f"[DIAGNOSTIC] Final screenshot saved: {output_path}")
                logger.info(f"[DIAGNOSTIC] Final screenshot size: {final_size} bytes ({final_size / 1024:.2f} KB)")

                # DIAGNOSTIC: Get image dimensions.
                try:
                    with Image.open(output_path) as final_img:
                        logger.info(f"[DIAGNOSTIC] Final screenshot dimensions: {final_img.width}x{final_img.height}")
                except Exception as img_err:
                    logger.warning(f"[DIAGNOSTIC] Could not read final image dimensions: {img_err}")

                logger.info(f"Full-page screenshot saved to {output_path} (via CDP)")
            except Exception as e:
                logger.error(f"[DEBUG] Full-page/Tab capture failed: {e}")
                # Fallback 1: Playwright full-page screenshot; Fallback 2: viewport only.
                try:
                    await page.screenshot(path=output_path, full_page=True, timeout=10000)
                except Exception as e2:
                    logger.error(f"[DEBUG] Fallback screenshot also failed: {e2}")
                    await page.screenshot(path=output_path, timeout=5000)

            await browser.close()
            logger.info(f"Screenshot saved to {output_path}")
            return True
|
||||
# [/DEF:ScreenshotService.capture_dashboard:Function]
|
||||
# [/DEF:ScreenshotService:Class]
|
||||
|
||||
# [DEF:LLMClient:Class]
|
||||
# @PURPOSE: Wrapper for LLM provider APIs.
|
||||
class LLMClient:
|
||||
# [DEF:LLMClient.__init__:Function]
|
||||
# @PURPOSE: Initializes the LLMClient with provider settings.
|
||||
# @PRE: api_key, base_url, and default_model are non-empty strings.
|
||||
def __init__(self, provider_type: LLMProviderType, api_key: str, base_url: str, default_model: str):
    """Initialize the LLM client for an OpenAI-compatible provider.

    :param provider_type: Provider kind (see LLMProviderType).
    :param api_key: API key for the provider; expected non-empty.
    :param base_url: Base URL of the OpenAI-compatible endpoint.
    :param default_model: Model name used when no model is specified.
    """
    self.provider_type = provider_type
    self.api_key = api_key
    self.base_url = base_url
    self.default_model = default_model

    # Log configuration for debugging WITHOUT exposing any part of the
    # secret: the original code logged the first 8 characters of the key,
    # which leaks key material into log files. Presence + length is enough
    # to diagnose empty/truncated keys.
    logger.info(f"[LLMClient.__init__] Initializing LLM client:")
    logger.info(f"[LLMClient.__init__] Provider Type: {provider_type}")
    logger.info(f"[LLMClient.__init__] Base URL: {base_url}")
    logger.info(f"[LLMClient.__init__] Default Model: {default_model}")
    logger.info(f"[LLMClient.__init__] API Key present: {bool(api_key)}")
    logger.info(f"[LLMClient.__init__] API Key Length: {len(api_key) if api_key else 0}")

    self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
# [/DEF:LLMClient.__init__:Function]
|
||||
|
||||
# [DEF:analyze_dashboard:Function]
|
||||
# @PURPOSE: Sends dashboard data to LLM for analysis.
|
||||
# [DEF:LLMClient.get_json_completion:Function]
|
||||
# @PURPOSE: Helper to handle LLM calls with JSON mode and fallback parsing.
|
||||
# @PRE: messages is a list of valid message dictionaries.
|
||||
# @POST: Returns a parsed JSON dictionary.
|
||||
# @SIDE_EFFECT: Calls external LLM API.
|
||||
def _should_retry(exception: Exception) -> bool:
    """Tenacity retry predicate: retry everything except authentication errors.

    Authentication failures are permanent (bad/missing key), so retrying
    only wastes attempts; rate limits and transient errors are retried.

    :param exception: The exception raised by the attempted call.
    :return: True if the call should be retried.
    """
    # Original checked isinstance(exception, (RateLimitError, Exception)),
    # which is always True for any exception instance — the intent is simply
    # "retry unless it is an auth error".
    return not isinstance(exception, OpenAIAuthenticationError)
|
||||
|
||||
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=2, min=5, max=60),
    # A merge left TWO `retry=` keywords in this decorator (a SyntaxError);
    # keep the predicate version so auth errors are never retried.
    retry=retry_if_exception(_should_retry),
    reraise=True
)
async def get_json_completion(self, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Call the LLM requesting a JSON response, with fallbacks.

    Tries JSON mode first; if the provider rejects it, retries as plain
    text and then extracts JSON (optionally fenced in ``` blocks).

    :param messages: Chat messages in OpenAI format.
    :return: Parsed JSON dictionary from the model's response.
    :raises RuntimeError: If the provider returns no choices.
    :raises json.JSONDecodeError: If the content cannot be parsed as JSON.
    """
    with belief_scope("get_json_completion"):
        response = None
        try:
            try:
                logger.info(f"[get_json_completion] Attempting LLM call with JSON mode for model: {self.default_model}")
                logger.info(f"[get_json_completion] Base URL being used: {self.base_url}")
                logger.info(f"[get_json_completion] Number of messages: {len(messages)}")
                logger.info(f"[get_json_completion] API Key present: {bool(self.api_key and len(self.api_key) > 0)}")

                response = await self.client.chat.completions.create(
                    model=self.default_model,
                    messages=messages,
                    response_format={"type": "json_object"}
                )
            except Exception as e:
                # Some providers do not support JSON mode; fall back to plain text.
                if "JSON mode is not enabled" in str(e) or "400" in str(e):
                    logger.warning(f"[get_json_completion] JSON mode failed or not supported: {str(e)}. Falling back to plain text response.")
                    response = await self.client.chat.completions.create(
                        model=self.default_model,
                        messages=messages
                    )
                else:
                    raise e

            logger.debug(f"[get_json_completion] LLM Response: {response}")
        except OpenAIAuthenticationError as e:
            logger.error(f"[get_json_completion] Authentication error: {str(e)}")
            # Do not retry on auth errors - _should_retry stops the retry loop.
            raise
        except RateLimitError as e:
            logger.warning(f"[get_json_completion] Rate limit hit: {str(e)}")

            # Extract retry_delay from error metadata if available.
            retry_delay = 5.0  # Default fallback
            try:
                # Providers (e.g. Google-backed proxies) embed "retryDelay": "XXs"
                # either in the error string or in a RetryInfo detail of the body.
                import re

                error_str = str(e)
                match = re.search(r'"retryDelay":\s*"(\d+)s"', error_str)
                if match:
                    retry_delay = float(match.group(1))
                else:
                    if hasattr(e, 'body') and isinstance(e.body, dict):
                        details = e.body.get('error', {}).get('details', [])
                        for detail in details:
                            if detail.get('@type') == 'type.googleapis.com/google.rpc.RetryInfo':
                                delay_str = detail.get('retryDelay', '5s')
                                retry_delay = float(delay_str.rstrip('s'))
                                break
            except Exception as parse_e:
                logger.debug(f"[get_json_completion] Failed to parse retry delay: {parse_e}")

            # Add a small safety margin (0.5s) before tenacity re-attempts.
            wait_time = retry_delay + 0.5
            logger.info(f"[get_json_completion] Waiting for {wait_time}s before retry...")
            await asyncio.sleep(wait_time)
            raise
        except Exception as e:
            logger.error(f"[get_json_completion] LLM call failed: {str(e)}")
            raise

        if not response or not hasattr(response, 'choices') or not response.choices:
            raise RuntimeError(f"Invalid LLM response: {response}")

        content = response.choices[0].message.content
        logger.debug(f"[get_json_completion] Raw content to parse: {content}")

        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Models sometimes wrap JSON in markdown code fences.
            logger.warning("[get_json_completion] Failed to parse JSON directly, attempting to extract from code blocks")
            if "```json" in content:
                json_str = content.split("```json")[1].split("```")[0].strip()
                return json.loads(json_str)
            elif "```" in content:
                json_str = content.split("```")[1].split("```")[0].strip()
                return json.loads(json_str)
            else:
                raise
# [/DEF:LLMClient.get_json_completion:Function]
|
||||
|
||||
# [DEF:LLMClient.analyze_dashboard:Function]
|
||||
# @PURPOSE: Sends dashboard data (screenshot + logs) to LLM for health analysis.
|
||||
# @PRE: screenshot_path exists, logs is a list of strings.
|
||||
# @POST: Returns a structured analysis dictionary (status, summary, issues).
|
||||
# @SIDE_EFFECT: Reads screenshot file and calls external LLM API.
|
||||
async def analyze_dashboard(self, screenshot_path: str, logs: List[str]) -> Dict[str, Any]:
|
||||
with belief_scope("analyze_dashboard"):
|
||||
import base64
|
||||
with open(screenshot_path, "rb") as image_file:
|
||||
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
|
||||
# Optimize image to reduce token count (US1 / T023)
|
||||
# Gemini/Gemma models have limits on input tokens, and large images contribute significantly.
|
||||
try:
|
||||
with Image.open(screenshot_path) as img:
|
||||
# Convert to RGB if necessary
|
||||
if img.mode in ("RGBA", "P"):
|
||||
img = img.convert("RGB")
|
||||
|
||||
# Resize if too large (max 1024px width while maintaining aspect ratio)
|
||||
# We reduce width further to 1024px to stay within token limits for long dashboards
|
||||
max_width = 1024
|
||||
if img.width > max_width or img.height > 2048:
|
||||
# Calculate scaling factor to fit within 1024x2048
|
||||
scale = min(max_width / img.width, 2048 / img.height)
|
||||
if scale < 1.0:
|
||||
new_width = int(img.width * scale)
|
||||
new_height = int(img.height * scale)
|
||||
img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
|
||||
logger.info(f"[analyze_dashboard] Resized image from {img.width}x{img.height} to {new_width}x{new_height}")
|
||||
|
||||
# Compress and convert to base64
|
||||
buffer = io.BytesIO()
|
||||
# Lower quality to 60% to further reduce payload size
|
||||
img.save(buffer, format="JPEG", quality=60, optimize=True)
|
||||
base_64_image = base64.b64encode(buffer.getvalue()).decode('utf-8')
|
||||
logger.info(f"[analyze_dashboard] Optimized image size: {len(buffer.getvalue()) / 1024:.2f} KB")
|
||||
except Exception as img_e:
|
||||
logger.warning(f"[analyze_dashboard] Image optimization failed: {img_e}. Using raw image.")
|
||||
with open(screenshot_path, "rb") as image_file:
|
||||
base_64_image = base64.b64encode(image_file.read()).decode('utf-8')
|
||||
|
||||
log_text = "\n".join(logs)
|
||||
prompt = f"""
|
||||
@@ -177,48 +599,31 @@ class LLMClient:
|
||||
}}
|
||||
"""
|
||||
|
||||
logger.debug(f"[analyze_dashboard] Calling LLM with model: {self.default_model}")
|
||||
try:
|
||||
response = await self.client.chat.completions.create(
|
||||
model=self.default_model,
|
||||
messages=[
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{base64_image}"
|
||||
}
|
||||
}
|
||||
]
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{base_64_image}"
|
||||
}
|
||||
}
|
||||
],
|
||||
response_format={"type": "json_object"}
|
||||
)
|
||||
logger.debug(f"[analyze_dashboard] LLM Response: {response}")
|
||||
except RateLimitError as e:
|
||||
logger.warning(f"[analyze_dashboard] Rate limit hit: {str(e)}")
|
||||
raise # tenacity will handle retry
|
||||
except Exception as e:
|
||||
logger.error(f"[analyze_dashboard] LLM call failed: {str(e)}")
|
||||
raise
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
if not response or not hasattr(response, 'choices') or not response.choices:
|
||||
error_info = getattr(response, 'error', 'No choices in response')
|
||||
logger.error(f"[analyze_dashboard] Invalid LLM response. Error info: {error_info}")
|
||||
try:
|
||||
return await self.get_json_completion(messages)
|
||||
except Exception as e:
|
||||
logger.error(f"[analyze_dashboard] Failed to get analysis: {str(e)}")
|
||||
return {
|
||||
"status": "FAIL",
|
||||
"summary": f"Failed to get response from LLM: {error_info}",
|
||||
"summary": f"Failed to get response from LLM: {str(e)}",
|
||||
"issues": [{"severity": "FAIL", "message": "LLM provider returned empty or invalid response"}]
|
||||
}
|
||||
|
||||
import json
|
||||
result = json.loads(response.choices[0].message.content)
|
||||
return result
|
||||
# [/DEF:analyze_dashboard:Function]
|
||||
|
||||
# [/DEF:LLMClient.analyze_dashboard:Function]
|
||||
# [/DEF:LLMClient:Class]
|
||||
|
||||
# [/DEF:backend.src.plugins.llm_analysis.service:Module]
|
||||
# [/DEF:backend/src/plugins/llm_analysis/service.py:Module]
|
||||
|
||||
@@ -19,7 +19,7 @@ import os
|
||||
class EncryptionManager:
|
||||
# @INVARIANT: Uses a secret key from environment or a default one (fallback only for dev).
|
||||
def __init__(self):
    """Initialize the Fernet cipher from the ENCRYPTION_KEY environment variable.

    A merge left two conflicting default-key assignments here; only the
    second ever took effect, so the dead first assignment is removed.

    :raises ValueError: Propagated from Fernet if the key is not a valid
        32-byte url-safe base64 string.
    """
    # Fallback key is a DEV-ONLY convenience; set ENCRYPTION_KEY in production.
    # NOTE(review): changing this default invalidates secrets encrypted with
    # the previous default key (decryption will fail for them).
    self.key = os.getenv("ENCRYPTION_KEY", "ZcytYzi0iHIl4Ttr-GdAEk117aGRogkGvN3wiTxrPpE=").encode()
    self.fernet = Fernet(self.key)
|
||||
|
||||
def encrypt(self, data: str) -> str:
|
||||
@@ -80,7 +80,8 @@ class LLMProviderService:
|
||||
db_provider.provider_type = config.provider_type.value
|
||||
db_provider.name = config.name
|
||||
db_provider.base_url = config.base_url
|
||||
if config.api_key != "********":
|
||||
# Only update API key if provided (not None and not empty)
|
||||
if config.api_key is not None and config.api_key != "":
|
||||
db_provider.api_key = self.encryption.encrypt(config.api_key)
|
||||
db_provider.default_model = config.default_model
|
||||
db_provider.is_active = config.is_active
|
||||
@@ -108,8 +109,19 @@ class LLMProviderService:
|
||||
with belief_scope("get_decrypted_api_key"):
|
||||
db_provider = self.get_provider(provider_id)
|
||||
if not db_provider:
|
||||
logger.warning(f"[get_decrypted_api_key] Provider {provider_id} not found in database")
|
||||
return None
|
||||
|
||||
logger.info(f"[get_decrypted_api_key] Provider found: {db_provider.id}")
|
||||
logger.info(f"[get_decrypted_api_key] Encrypted API key length: {len(db_provider.api_key) if db_provider.api_key else 0}")
|
||||
|
||||
try:
|
||||
decrypted_key = self.encryption.decrypt(db_provider.api_key)
|
||||
logger.info(f"[get_decrypted_api_key] Decryption successful, key length: {len(decrypted_key) if decrypted_key else 0}")
|
||||
return decrypted_key
|
||||
except Exception as e:
|
||||
logger.error(f"[get_decrypted_api_key] Decryption failed: {str(e)}")
|
||||
return None
|
||||
return self.encryption.decrypt(db_provider.api_key)
|
||||
# [/DEF:get_decrypted_api_key:Function]
|
||||
|
||||
# [/DEF:LLMProviderService:Class]
|
||||
|
||||
Reference in New Issue
Block a user