Seems to work

commit 252a8601a9
parent 8044f85ea4
2026-01-30 11:10:16 +03:00
43 changed files with 1987 additions and 270 deletions

backend/src/plugins/llm_analysis/__init__.py

@@ -0,0 +1,11 @@
# [DEF:backend/src/plugins/llm_analysis/__init__.py:Module]
# @TIER: TRIVIAL
# @PURPOSE: Initialize the LLM Analysis plugin package.
"""
LLM Analysis Plugin for automated dashboard validation and dataset documentation.
"""
from .plugin import DashboardValidationPlugin, DocumentationPlugin
# [/DEF:backend/src/plugins/llm_analysis/__init__.py]

backend/src/plugins/llm_analysis/models.py

@@ -0,0 +1,61 @@
# [DEF:backend/src/plugins/llm_analysis/models.py:Module]
# @TIER: STANDARD
# @SEMANTICS: pydantic, models, llm
# @PURPOSE: Define Pydantic models for LLM Analysis plugin.
# @LAYER: Domain
from typing import List, Optional
from pydantic import BaseModel, Field
from datetime import datetime
from enum import Enum
# [DEF:LLMProviderType:Class]
# @PURPOSE: Enum for supported LLM providers.
class LLMProviderType(str, Enum):
OPENAI = "openai"
OPENROUTER = "openrouter"
KILO = "kilo"
# [/DEF:LLMProviderType:Class]
# [DEF:LLMProviderConfig:Class]
# @PURPOSE: Configuration for an LLM provider.
class LLMProviderConfig(BaseModel):
id: Optional[str] = None
provider_type: LLMProviderType
name: str
base_url: str
api_key: str
default_model: str
is_active: bool = True
# [/DEF:LLMProviderConfig:Class]
# [DEF:ValidationStatus:Class]
# @PURPOSE: Enum for dashboard validation status.
class ValidationStatus(str, Enum):
PASS = "PASS"
WARN = "WARN"
FAIL = "FAIL"
# [/DEF:ValidationStatus:Class]
# [DEF:DetectedIssue:Class]
# @PURPOSE: Model for a single issue detected during validation.
class DetectedIssue(BaseModel):
severity: ValidationStatus
message: str
location: Optional[str] = None
# [/DEF:DetectedIssue:Class]
# [DEF:ValidationResult:Class]
# @PURPOSE: Model for dashboard validation result.
class ValidationResult(BaseModel):
id: Optional[str] = None
dashboard_id: str
timestamp: datetime = Field(default_factory=datetime.utcnow)
status: ValidationStatus
screenshot_path: Optional[str] = None
issues: List[DetectedIssue]
summary: str
raw_response: Optional[str] = None
# [/DEF:ValidationResult:Class]
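# Illustrative construction of a validation result (values are made up):
# ValidationResult(
#     dashboard_id="42",
#     status=ValidationStatus.WARN,
#     summary="One chart rendered empty",
#     issues=[DetectedIssue(severity=ValidationStatus.WARN,
#                           message="Chart shows no data",
#                           location="Sales by Region")],
# )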
# [/DEF:backend/src/plugins/llm_analysis/models.py]

backend/src/plugins/llm_analysis/plugin.py

@@ -0,0 +1,272 @@
# [DEF:backend.src.plugins.llm_analysis.plugin:Module]
# @TIER: STANDARD
# @SEMANTICS: plugin, llm, analysis, documentation
# @PURPOSE: Implements DashboardValidationPlugin and DocumentationPlugin.
# @LAYER: Domain
# @RELATION: INHERITS_FROM -> backend.src.core.plugin_base.PluginBase
from typing import Dict, Any, Optional, List
import os
from datetime import datetime
from ...core.plugin_base import PluginBase
from ...core.logger import belief_scope, logger
from ...core.database import SessionLocal
from ...services.llm_provider import LLMProviderService
from .service import ScreenshotService, LLMClient
from .models import LLMProviderType, ValidationStatus, ValidationResult, DetectedIssue
from ...models.llm import ValidationRecord
# [DEF:DashboardValidationPlugin:Class]
# @PURPOSE: Plugin for automated dashboard health analysis using LLMs.
class DashboardValidationPlugin(PluginBase):
@property
def id(self) -> str:
return "llm_dashboard_validation"
@property
def name(self) -> str:
return "Dashboard LLM Validation"
@property
def description(self) -> str:
return "Automated dashboard health analysis using multimodal LLMs."
@property
def version(self) -> str:
return "1.0.0"
def get_schema(self) -> Dict[str, Any]:
return {
"type": "object",
"properties": {
"dashboard_id": {"type": "string", "title": "Dashboard ID"},
"environment_id": {"type": "string", "title": "Environment ID"},
"provider_id": {"type": "string", "title": "LLM Provider ID"}
},
"required": ["dashboard_id", "environment_id", "provider_id"]
}
async def execute(self, params: Dict[str, Any]):
with belief_scope("execute", f"plugin_id={self.id}"):
logger.info(f"Executing {self.name} with params: {params}")
dashboard_id = params.get("dashboard_id")
env_id = params.get("environment_id")
provider_id = params.get("provider_id")
task_id = params.get("_task_id")
db = SessionLocal()
try:
# 1. Get Environment
from ...dependencies import get_config_manager
config_mgr = get_config_manager()
env = config_mgr.get_environment(env_id)
if not env:
raise ValueError(f"Environment {env_id} not found")
# 2. Get LLM Provider
llm_service = LLMProviderService(db)
db_provider = llm_service.get_provider(provider_id)
if not db_provider:
raise ValueError(f"LLM Provider {provider_id} not found")
api_key = llm_service.get_decrypted_api_key(provider_id)
# 3. Capture Screenshot
screenshot_service = ScreenshotService(env)
os.makedirs("ss-tools-storage/screenshots", exist_ok=True)
screenshot_path = f"ss-tools-storage/screenshots/{dashboard_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
await screenshot_service.capture_dashboard(dashboard_id, screenshot_path)
# 4. Fetch Logs (Last 100 lines from backend.log)
logs = []
log_file = "backend.log"
if os.path.exists(log_file):
with open(log_file, "r") as f:
# Read last 100 lines
all_lines = f.readlines()
logs = all_lines[-100:]
if not logs:
logs = ["No logs found in backend.log"]
# 5. Analyze with LLM
llm_client = LLMClient(
provider_type=LLMProviderType(db_provider.provider_type),
api_key=api_key,
base_url=db_provider.base_url,
default_model=db_provider.default_model
)
analysis = await llm_client.analyze_dashboard(screenshot_path, logs)
# 6. Persist Result
validation_result = ValidationResult(
dashboard_id=dashboard_id,
status=ValidationStatus(analysis["status"]),
summary=analysis["summary"],
issues=[DetectedIssue(**issue) for issue in analysis["issues"]],
screenshot_path=screenshot_path,
raw_response=str(analysis)
)
db_record = ValidationRecord(
dashboard_id=validation_result.dashboard_id,
status=validation_result.status.value,
summary=validation_result.summary,
issues=[issue.dict() for issue in validation_result.issues],
screenshot_path=validation_result.screenshot_path,
raw_response=validation_result.raw_response
)
db.add(db_record)
db.commit()
# 7. Notification on failure (US1 / FR-015)
if validation_result.status == ValidationStatus.FAIL:
logger.warning(f"Dashboard {dashboard_id} validation FAILED. Summary: {validation_result.summary}")
# Placeholder for Email/Pulse notification dispatch
# In a real implementation, we would call a NotificationService here
# with a payload containing the summary and a link to the report.
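# A hedged sketch of that dispatch; NotificationService, its
# constructor, and send() are assumptions, not existing APIs in
# this codebase:
# notifier = NotificationService(db)
# await notifier.send(
#     subject=f"Dashboard {dashboard_id} validation FAILED",
#     body=validation_result.summary,
#     link=f"/validations/{db_record.id}",
# )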
return validation_result.dict()
finally:
db.close()
# [/DEF:DashboardValidationPlugin:Class]
# [DEF:DocumentationPlugin:Class]
# @PURPOSE: Plugin for automated dataset documentation using LLMs.
class DocumentationPlugin(PluginBase):
@property
def id(self) -> str:
return "llm_documentation"
@property
def name(self) -> str:
return "Dataset LLM Documentation"
@property
def description(self) -> str:
return "Automated dataset and column documentation using LLMs."
@property
def version(self) -> str:
return "1.0.0"
def get_schema(self) -> Dict[str, Any]:
return {
"type": "object",
"properties": {
"dataset_id": {"type": "string", "title": "Dataset ID"},
"environment_id": {"type": "string", "title": "Environment ID"},
"provider_id": {"type": "string", "title": "LLM Provider ID"}
},
"required": ["dataset_id", "environment_id", "provider_id"]
}
async def execute(self, params: Dict[str, Any]):
with belief_scope("execute", f"plugin_id={self.id}"):
logger.info(f"Executing {self.name} with params: {params}")
dataset_id = params.get("dataset_id")
env_id = params.get("environment_id")
provider_id = params.get("provider_id")
db = SessionLocal()
try:
# 1. Get Environment
from ...dependencies import get_config_manager
config_mgr = get_config_manager()
env = config_mgr.get_environment(env_id)
if not env:
raise ValueError(f"Environment {env_id} not found")
# 2. Get LLM Provider
llm_service = LLMProviderService(db)
db_provider = llm_service.get_provider(provider_id)
if not db_provider:
raise ValueError(f"LLM Provider {provider_id} not found")
api_key = llm_service.get_decrypted_api_key(provider_id)
# 3. Fetch Metadata (US2 / T024)
from ...core.superset_client import SupersetClient
client = SupersetClient(env)
# Optimistic locking (T045): capture the dataset's current modification
# timestamp so a concurrent edit can be detected before writing back.
dataset = client.get_dataset(int(dataset_id))
# The response shape varies between Superset versions, so check both fields.
original_changed_on = dataset.get("changed_on_utc") or dataset.get("result", {}).get("changed_on_utc")
# Extract columns and existing descriptions
columns_data = []
for col in dataset.get("columns", []):
columns_data.append({
"name": col.get("column_name"),
"type": col.get("type"),
"description": col.get("description")
})
# 4. Construct Prompt & Analyze (US2 / T025)
llm_client = LLMClient(
provider_type=LLMProviderType(db_provider.provider_type),
api_key=api_key,
base_url=db_provider.base_url,
default_model=db_provider.default_model
)
prompt = f"""
Generate professional documentation for the following dataset and its columns.
Dataset: {dataset.get('table_name')}
Columns: {columns_data}
Provide the documentation in JSON format:
{{
"dataset_description": "General description of the dataset",
"column_descriptions": [
{{
"name": "column_name",
"description": "Generated description"
}}
]
}}
"""
# Using a generic chat completion for text-only US2
response = await llm_client.client.chat.completions.create(
model=db_provider.default_model,
messages=[{"role": "user", "content": prompt}],
response_format={"type": "json_object"}
)
import json
doc_result = json.loads(response.choices[0].message.content)
# 5. Update Metadata (US2 / T026)
# Long-term this belongs in mapping_service; it is inlined here to keep
# the plugin flow self-contained. Write the descriptions back to Superset.
update_payload = {
"description": doc_result["dataset_description"],
"columns": []
}
# Map generated descriptions back to column IDs
for col_doc in doc_result["column_descriptions"]:
for col in dataset.get("columns", []):
if col.get("column_name") == col_doc["name"]:
update_payload["columns"].append({
"id": col.get("id"),
"description": col_doc["description"]
})
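# A hedged sketch of the optimistic-locking check implied above (T045);
# not wired in yet, and the cost of the re-fetch is untested:
# latest = client.get_dataset(int(dataset_id))
# latest_changed_on = latest.get("changed_on_utc") or latest.get("result", {}).get("changed_on_utc")
# if latest_changed_on != original_changed_on:
#     raise RuntimeError(f"Dataset {dataset_id} changed since it was read; aborting update")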
client.update_dataset(int(dataset_id), update_payload)
return doc_result
finally:
db.close()
# [/DEF:DocumentationPlugin:Class]
# [/DEF:backend.src.plugins.llm_analysis.plugin:Module]

backend/src/plugins/llm_analysis/scheduler.py

@@ -0,0 +1,56 @@
# [DEF:backend/src/plugins/llm_analysis/scheduler.py:Module]
# @TIER: STANDARD
# @SEMANTICS: scheduler, task, automation
# @PURPOSE: Provides helper functions to schedule LLM-based validation tasks.
# @LAYER: Domain
# @RELATION: DEPENDS_ON -> backend.src.core.scheduler
from typing import Dict, Any
from ...dependencies import get_task_manager, get_scheduler_service
from ...core.logger import belief_scope, logger
# [DEF:schedule_dashboard_validation:Function]
# @PURPOSE: Schedules a recurring dashboard validation task.
# @PARAM: dashboard_id (str) - ID of the dashboard to validate.
# @PARAM: cron_expression (str) - Standard cron expression for scheduling.
# @PARAM: params (Dict[str, Any]) - Task parameters (environment_id, provider_id).
def schedule_dashboard_validation(dashboard_id: str, cron_expression: str, params: Dict[str, Any]):
with belief_scope("schedule_dashboard_validation", f"dashboard_id={dashboard_id}"):
scheduler = get_scheduler_service()
task_manager = get_task_manager()
job_id = f"llm_val_{dashboard_id}"
async def job_func():
await task_manager.create_task(
plugin_id="llm_dashboard_validation",
params={
"dashboard_id": dashboard_id,
**params
}
)
scheduler.add_job(
job_func,
"cron",
id=job_id,
replace_existing=True,
**_parse_cron(cron_expression)
)
logger.info(f"Scheduled validation for dashboard {dashboard_id} with cron {cron_expression}")
def _parse_cron(cron: str) -> Dict[str, str]:
# Minimal 5-field cron parser: minute hour day month day_of_week.
parts = cron.split()
if len(parts) != 5:
raise ValueError(f"Invalid cron expression: {cron!r} (expected 5 fields)")
return {
"minute": parts[0],
"hour": parts[1],
"day": parts[2],
"month": parts[3],
"day_of_week": parts[4]
}
# [/DEF:schedule_dashboard_validation:Function]
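# Example wiring (illustrative; the IDs and cron string are assumptions):
# schedule_dashboard_validation(
#     dashboard_id="42",
#     cron_expression="0 6 * * 1-5",  # weekdays at 06:00
#     params={"environment_id": "prod", "provider_id": "openrouter-default"},
# )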
# [/DEF:backend/src/plugins/llm_analysis/scheduler.py]

backend/src/plugins/llm_analysis/service.py

@@ -0,0 +1,224 @@
# [DEF:backend.src.plugins.llm_analysis.service:Module]
# @TIER: STANDARD
# @SEMANTICS: service, llm, screenshot, playwright, openai
# @PURPOSE: Services for LLM interaction and dashboard screenshots.
# @LAYER: Domain
# @RELATION: DEPENDS_ON -> playwright
# @RELATION: DEPENDS_ON -> openai
# @RELATION: DEPENDS_ON -> tenacity
import asyncio
from typing import List, Optional, Dict, Any
from playwright.async_api import async_playwright
from openai import AsyncOpenAI, RateLimitError
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from .models import LLMProviderType, ValidationResult, ValidationStatus, DetectedIssue
from ...core.logger import belief_scope, logger
from ...core.config_models import Environment
# [DEF:ScreenshotService:Class]
# @PURPOSE: Handles capturing screenshots of Superset dashboards.
class ScreenshotService:
# @PRE: env is a valid Environment object.
def __init__(self, env: Environment):
self.env = env
# [DEF:capture_dashboard:Function]
# @PURPOSE: Captures a screenshot of a dashboard using Playwright.
# @PARAM: dashboard_id (str) - ID of the dashboard.
# @PARAM: output_path (str) - Path to save the screenshot.
# @RETURN: bool - True if successful.
async def capture_dashboard(self, dashboard_id: str, output_path: str) -> bool:
with belief_scope("capture_dashboard", f"dashboard_id={dashboard_id}"):
logger.info(f"Capturing screenshot for dashboard {dashboard_id}")
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(viewport={'width': 1280, 'height': 720})
page = await context.new_page()
# 1. Authenticate via API to get tokens
from ...core.superset_client import SupersetClient
client = SupersetClient(self.env)
try:
tokens = client.authenticate()
access_token = tokens.get("access_token")
# Superset's UI authenticates with session cookies rather than the JWT,
# so the access token cannot simply be injected into the browser session.
# The UI login below remains the reliable path for Playwright; sending
# the token as an Authorization header is a possible alternative.
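# A hedged sketch of that alternative (set_extra_http_headers is standard
# Playwright API; whether Superset honours a Bearer token for page loads
# is an assumption):
# await context.set_extra_http_headers({"Authorization": f"Bearer {access_token}"})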
logger.info("API Authentication successful")
except Exception as e:
logger.warning(f"API Authentication failed: {e}. Falling back to UI login.")
# 2. Navigate to dashboard
dashboard_url = f"{self.env.url}/superset/dashboard/{dashboard_id}/"
logger.info(f"Navigating to {dashboard_url}")
await page.goto(dashboard_url)
await page.wait_for_load_state("networkidle")
# 3. Check if we are redirected to login
if "/login" in page.url:
logger.info(f"Redirected to login: {page.url}. Filling credentials from Environment.")
# More exhaustive list of selectors for various Superset versions/themes
selectors = {
"username": ['input[name="username"]', 'input#username', 'input[placeholder*="Username"]'],
"password": ['input[name="password"]', 'input#password', 'input[placeholder*="Password"]'],
"submit": ['button[type="submit"]', 'button#submit', '.btn-primary']
}
try:
# Find and fill username
u_selector = None
for s in selectors["username"]:
if await page.locator(s).count() > 0:
u_selector = s
break
if not u_selector:
raise RuntimeError("Could not find username input field")
await page.fill(u_selector, self.env.username)
# Find and fill password
p_selector = None
for s in selectors["password"]:
if await page.locator(s).count() > 0:
p_selector = s
break
if not p_selector:
raise RuntimeError("Could not find password input field")
await page.fill(p_selector, self.env.password)
# Click submit
s_selector = selectors["submit"][0]
for s in selectors["submit"]:
if await page.locator(s).count() > 0:
s_selector = s
break
await page.click(s_selector)
await page.wait_for_load_state("networkidle")
# Re-verify we are at the dashboard
if "/login" in page.url:
# Check for error messages on page
error_locator = page.locator(".alert-danger, .error-message")
error_msg = await error_locator.text_content() if await error_locator.count() > 0 else "Unknown error"
raise RuntimeError(f"Login failed after submission: {error_msg}")
if "/superset/dashboard" not in page.url:
logger.info(f"Redirecting back to dashboard after login: {dashboard_url}")
await page.goto(dashboard_url)
await page.wait_for_load_state("networkidle")
except Exception as e:
page_title = await page.title()
logger.error(f"UI Login failed. Page title: {page_title}, URL: {page.url}, Error: {str(e)}")
debug_path = output_path.replace(".png", "_debug_failed_login.png")
await page.screenshot(path=debug_path)
raise RuntimeError(f"Login failed: {str(e)}. Debug screenshot saved to {debug_path}")
# Wait a bit more for charts to render
await asyncio.sleep(5)
await page.screenshot(path=output_path, full_page=True)
await browser.close()
logger.info(f"Screenshot saved to {output_path}")
return True
# [/DEF:ScreenshotService:Class]
# [DEF:LLMClient:Class]
# @PURPOSE: Wrapper for LLM provider APIs.
class LLMClient:
def __init__(self, provider_type: LLMProviderType, api_key: str, base_url: str, default_model: str):
self.provider_type = provider_type
self.api_key = api_key
self.base_url = base_url
self.default_model = default_model
self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
# [DEF:analyze_dashboard:Function]
# @PURPOSE: Sends dashboard data to LLM for analysis.
@retry(
stop=stop_after_attempt(5),
wait=wait_exponential(multiplier=2, min=5, max=60),
retry=retry_if_exception_type(RateLimitError)  # only rate limits are retried; other errors propagate
)
async def analyze_dashboard(self, screenshot_path: str, logs: List[str]) -> Dict[str, Any]:
with belief_scope("analyze_dashboard"):
import base64
with open(screenshot_path, "rb") as image_file:
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
log_text = "\n".join(logs)
prompt = f"""
Analyze the attached dashboard screenshot and the following execution logs for health and visual issues.
Logs:
{log_text}
Provide the analysis in JSON format with the following structure:
{{
"status": "PASS" | "WARN" | "FAIL",
"summary": "Short summary of findings",
"issues": [
{{
"severity": "WARN" | "FAIL",
"message": "Description of the issue",
"location": "Optional location info (e.g. chart name)"
}}
]
}}
"""
logger.debug(f"[analyze_dashboard] Calling LLM with model: {self.default_model}")
try:
response = await self.client.chat.completions.create(
model=self.default_model,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
}
]
}
],
response_format={"type": "json_object"}
)
logger.debug(f"[analyze_dashboard] LLM Response: {response}")
except RateLimitError as e:
logger.warning(f"[analyze_dashboard] Rate limit hit: {str(e)}")
raise # tenacity will handle retry
except Exception as e:
logger.error(f"[analyze_dashboard] LLM call failed: {str(e)}")
raise
if not response or not hasattr(response, 'choices') or not response.choices:
error_info = getattr(response, 'error', 'No choices in response')
logger.error(f"[analyze_dashboard] Invalid LLM response. Error info: {error_info}")
return {
"status": "FAIL",
"summary": f"Failed to get response from LLM: {error_info}",
"issues": [{"severity": "FAIL", "message": "LLM provider returned empty or invalid response"}]
}
import json
result = json.loads(response.choices[0].message.content)
return result
# [/DEF:analyze_dashboard:Function]
# [/DEF:LLMClient:Class]
# [/DEF:backend.src.plugins.llm_analysis.service:Module]