# [DEF:backend/src/plugins/llm_analysis/service.py:Module] # @TIER: STANDARD # @SEMANTICS: service, llm, screenshot, playwright, openai # @PURPOSE: Services for LLM interaction and dashboard screenshots. # @LAYER: Domain # @RELATION: DEPENDS_ON -> playwright # @RELATION: DEPENDS_ON -> openai # @RELATION: DEPENDS_ON -> tenacity # @INVARIANT: Screenshots must be 1920px width and capture full page height. import asyncio import base64 import json import io from typing import List, Dict, Any from PIL import Image from playwright.async_api import async_playwright from openai import AsyncOpenAI, RateLimitError, AuthenticationError as OpenAIAuthenticationError from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception from .models import LLMProviderType from ...core.logger import belief_scope, logger from ...core.config_models import Environment # [DEF:ScreenshotService:Class] # @PURPOSE: Handles capturing screenshots of Superset dashboards. class ScreenshotService: # [DEF:ScreenshotService.__init__:Function] # @PURPOSE: Initializes the ScreenshotService with environment configuration. # @PRE: env is a valid Environment object. def __init__(self, env: Environment): self.env = env # [/DEF:ScreenshotService.__init__:Function] # [DEF:ScreenshotService.capture_dashboard:Function] # @PURPOSE: Captures a full-page screenshot of a dashboard using Playwright and CDP. # @PRE: dashboard_id is a valid string, output_path is a writable path. # @POST: Returns True if screenshot is saved successfully. # @SIDE_EFFECT: Launches a browser, performs UI login, switches tabs, and writes a PNG file. 
    # @UX_STATE: [Navigating] -> Loading dashboard UI
    # @UX_STATE: [TabSwitching] -> Iterating through dashboard tabs to trigger lazy loading
    # @UX_STATE: [CalculatingHeight] -> Determining dashboard dimensions
    # @UX_STATE: [Capturing] -> Executing CDP screenshot
    async def capture_dashboard(self, dashboard_id: str, output_path: str) -> bool:
        """Capture a full-page PNG of the given dashboard.

        Flow: UI login -> navigate to the dashboard in standalone mode ->
        wait for charts to render -> click through tabs to force lazy
        rendering -> resize viewport to the dashboard's full height ->
        capture via a CDP ``Page.captureScreenshot`` call (bypasses
        Playwright's font-loading wait). Falls back to Playwright
        screenshots if the CDP path fails.

        Returns True on success; raises RuntimeError when UI login fails.
        """
        with belief_scope("capture_dashboard", f"dashboard_id={dashboard_id}"):
            logger.info(f"Capturing screenshot for dashboard {dashboard_id}")
            async with async_playwright() as p:
                browser = await p.chromium.launch(
                    headless=True,
                    args=[
                        "--disable-blink-features=AutomationControlled",
                        "--disable-infobars",
                        "--no-sandbox"
                    ]
                )
                # Set a realistic user agent to avoid 403 Forbidden from OpenResty/WAF
                user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                # Construct base UI URL from environment (strip /api/v1 suffix)
                base_ui_url = self.env.url.rstrip("/")
                if base_ui_url.endswith("/api/v1"):
                    base_ui_url = base_ui_url[:-len("/api/v1")]
                # Create browser context with realistic headers
                context = await browser.new_context(
                    viewport={'width': 1280, 'height': 720},
                    user_agent=user_agent,
                    extra_http_headers={
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
                        "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
                        "Upgrade-Insecure-Requests": "1",
                        "Sec-Fetch-Dest": "document",
                        "Sec-Fetch-Mode": "navigate",
                        "Sec-Fetch-Site": "none",
                        "Sec-Fetch-User": "?1"
                    }
                )
                logger.info("Browser context created successfully")
                page = await context.new_page()
                # Bypass navigator.webdriver detection
                await page.add_init_script("delete Object.getPrototypeOf(navigator).webdriver")
                # 1. Navigate to login page and authenticate
                login_url = f"{base_ui_url.rstrip('/')}/login/"
                logger.info(f"[DEBUG] Navigating to login page: {login_url}")
                response = await page.goto(login_url, wait_until="networkidle", timeout=60000)
                if response:
                    logger.info(f"[DEBUG] Login page response status: {response.status}")
                # Wait for login form to be ready
                await page.wait_for_load_state("domcontentloaded")
                # More exhaustive list of selectors for various Superset versions/themes
                selectors = {
                    "username": ['input[name="username"]', 'input#username', 'input[placeholder*="Username"]', 'input[type="text"]'],
                    "password": ['input[name="password"]', 'input#password', 'input[placeholder*="Password"]', 'input[type="password"]'],
                    "submit": ['button[type="submit"]', 'button#submit', '.btn-primary', 'input[type="submit"]']
                }
                logger.info("[DEBUG] Attempting to find login form elements...")
                try:
                    # Find and fill username
                    u_selector = None
                    for s in selectors["username"]:
                        count = await page.locator(s).count()
                        logger.info(f"[DEBUG] Selector '{s}': {count} elements found")
                        if count > 0:
                            u_selector = s
                            break
                    if not u_selector:
                        # Log all input fields on the page for debugging
                        all_inputs = await page.locator('input').all()
                        logger.info(f"[DEBUG] Found {len(all_inputs)} input fields on page")
                        for i, inp in enumerate(all_inputs[:5]):  # Log first 5
                            inp_type = await inp.get_attribute('type')
                            inp_name = await inp.get_attribute('name')
                            inp_id = await inp.get_attribute('id')
                            logger.info(f"[DEBUG] Input {i}: type={inp_type}, name={inp_name}, id={inp_id}")
                        raise RuntimeError("Could not find username input field on login page")
                    logger.info(f"[DEBUG] Filling username field with selector: {u_selector}")
                    await page.fill(u_selector, self.env.username)
                    # Find and fill password
                    p_selector = None
                    for s in selectors["password"]:
                        if await page.locator(s).count() > 0:
                            p_selector = s
                            break
                    if not p_selector:
                        raise RuntimeError("Could not find password input field on login page")
                    logger.info(f"[DEBUG] Filling password field with selector: {p_selector}")
                    await page.fill(p_selector, self.env.password)
                    # Click submit; default to the first selector if none match
                    s_selector = selectors["submit"][0]
                    for s in selectors["submit"]:
                        if await page.locator(s).count() > 0:
                            s_selector = s
                            break
                    logger.info(f"[DEBUG] Clicking submit button with selector: {s_selector}")
                    await page.click(s_selector)
                    # Wait for navigation after login
                    await page.wait_for_load_state("networkidle", timeout=30000)
                    # Check if login was successful (still being on /login means failure)
                    if "/login" in page.url:
                        # Check for error messages on page
                        error_msg = await page.locator(".alert-danger, .error-message").text_content() if await page.locator(".alert-danger, .error-message").count() > 0 else "Unknown error"
                        logger.error(f"[DEBUG] Login failed. Still on login page. Error: {error_msg}")
                        debug_path = output_path.replace(".png", "_debug_failed_login.png")
                        await page.screenshot(path=debug_path)
                        raise RuntimeError(f"Login failed: {error_msg}. Debug screenshot saved to {debug_path}")
                    logger.info(f"[DEBUG] Login successful. Current URL: {page.url}")
                    # Check cookies after successful login
                    page_cookies = await context.cookies()
                    logger.info(f"[DEBUG] Cookies after login: {len(page_cookies)}")
                    for c in page_cookies:
                        logger.info(f"[DEBUG] Cookie: name={c['name']}, domain={c['domain']}, value={c.get('value', '')[:20]}...")
                except Exception as e:
                    page_title = await page.title()
                    logger.error(f"UI Login failed. Page title: {page_title}, URL: {page.url}, Error: {str(e)}")
                    debug_path = output_path.replace(".png", "_debug_failed_login.png")
                    await page.screenshot(path=debug_path)
                    raise RuntimeError(f"Login failed: {str(e)}. Debug screenshot saved to {debug_path}")
                # 2. Navigate to dashboard
                # @UX_STATE: [Navigating] -> Loading dashboard UI
                dashboard_url = f"{base_ui_url.rstrip('/')}/superset/dashboard/{dashboard_id}/?standalone=true"
                # NOTE(review): dashboard_url is derived from base_ui_url, so this
                # scheme check looks unreachable — presumably defensive; confirm.
                if base_ui_url.startswith("https://") and dashboard_url.startswith("http://"):
                    dashboard_url = dashboard_url.replace("http://", "https://")
                logger.info(f"[DEBUG] Navigating to dashboard: {dashboard_url}")
                # Use networkidle to ensure all initial assets are loaded
                response = await page.goto(dashboard_url, wait_until="networkidle", timeout=60000)
                if response:
                    logger.info(f"[DEBUG] Dashboard navigation response status: {response.status}, URL: {response.url}")
                try:
                    # Wait for the dashboard grid to be present
                    await page.wait_for_selector('.dashboard-component, .dashboard-header, [data-test="dashboard-grid"]', timeout=30000)
                    logger.info("[DEBUG] Dashboard container loaded")
                    # Wait for charts to finish loading (Superset uses loading spinners/skeletons)
                    # We wait until loading indicators disappear or a timeout occurs
                    try:
                        # Wait for loading indicators to disappear
                        await page.wait_for_selector('.loading, .ant-skeleton, .spinner', state="hidden", timeout=60000)
                        logger.info("[DEBUG] Loading indicators hidden")
                    except Exception:
                        logger.warning("[DEBUG] Timeout waiting for loading indicators to hide")
                    # Wait for charts to actually render their content (e.g., ECharts, NVD3)
                    # We look for common chart containers that should have content
                    try:
                        await page.wait_for_selector('.chart-container canvas, .slice_container svg, .superset-chart-canvas, .grid-content .chart-container', timeout=60000)
                        logger.info("[DEBUG] Chart content detected")
                    except Exception:
                        logger.warning("[DEBUG] Timeout waiting for chart content")
                    # Additional check: wait for all chart containers to have non-empty content
                    logger.info("[DEBUG] Waiting for all charts to have rendered content...")
                    await page.wait_for_function("""() => {
                        const charts = document.querySelectorAll('.chart-container, .slice_container');
                        if (charts.length === 0) return true; // No charts to wait for
                        // Check if all charts have rendered content (canvas, svg, or non-empty div)
                        return Array.from(charts).every(chart => {
                            const hasCanvas = chart.querySelector('canvas') !== null;
                            const hasSvg = chart.querySelector('svg') !== null;
                            const hasContent = chart.innerText.trim().length > 0 || chart.children.length > 0;
                            return hasCanvas || hasSvg || hasContent;
                        });
                    }""", timeout=60000)
                    logger.info("[DEBUG] All charts have rendered content")
                    # Scroll to bottom and back to top to trigger lazy loading of all charts
                    logger.info("[DEBUG] Scrolling to trigger lazy loading...")
                    await page.evaluate("""async () => {
                        const delay = ms => new Promise(resolve => setTimeout(resolve, ms));
                        for (let i = 0; i < document.body.scrollHeight; i += 500) {
                            window.scrollTo(0, i);
                            await delay(100);
                        }
                        window.scrollTo(0, 0);
                        await delay(500);
                    }""")
                except Exception as e:
                    logger.warning(f"[DEBUG] Dashboard content wait failed: {e}, proceeding anyway after delay")
                # Final stabilization delay - increased for complex dashboards
                logger.info("[DEBUG] Final stabilization delay...")
                await asyncio.sleep(15)
                # Logic to handle tabs and full-page capture
                try:
                    # 1. Handle Tabs (Recursive switching)
                    # @UX_STATE: [TabSwitching] -> Iterating through dashboard tabs to trigger lazy loading
                    processed_tabs = set()

                    async def switch_tabs(depth=0):
                        # Click every visible, not-yet-processed tab so its charts
                        # lazy-load, recursing into nested tab sets.
                        if depth > 3:
                            return  # Limit recursion depth
                        tab_selectors = [
                            '.ant-tabs-nav-list .ant-tabs-tab',
                            '.dashboard-component-tabs .ant-tabs-tab',
                            '[data-test="dashboard-component-tabs"] .ant-tabs-tab'
                        ]
                        found_tabs = []
                        for selector in tab_selectors:
                            found_tabs = await page.locator(selector).all()
                            if found_tabs:
                                break
                        if found_tabs:
                            logger.info(f"[DEBUG][TabSwitching] Found {len(found_tabs)} tabs at depth {depth}")
                            for i, tab in enumerate(found_tabs):
                                try:
                                    tab_text = (await tab.inner_text()).strip()
                                    tab_id = f"{depth}_{i}_{tab_text}"
                                    if tab_id in processed_tabs:
                                        continue
                                    if await tab.is_visible():
                                        logger.info(f"[DEBUG][TabSwitching] Switching to tab: {tab_text}")
                                        processed_tabs.add(tab_id)
                                        is_active = "ant-tabs-tab-active" in (await tab.get_attribute("class") or "")
                                        if not is_active:
                                            await tab.click()
                                            await asyncio.sleep(2)  # Wait for content to render
                                        await switch_tabs(depth + 1)
                                except Exception as tab_e:
                                    logger.warning(f"[DEBUG][TabSwitching] Failed to process tab {i}: {tab_e}")
                            # Return to the first tab so the capture shows the default view.
                            # NOTE(review): indentation reconstructed from a collapsed
                            # source — placement after the tab loop is presumed; verify.
                            try:
                                first_tab = found_tabs[0]
                                if "ant-tabs-tab-active" not in (await first_tab.get_attribute("class") or ""):
                                    await first_tab.click()
                                    await asyncio.sleep(1)
                            except Exception:
                                pass

                    await switch_tabs()
                    # 2. Calculate full height for screenshot
                    # @UX_STATE: [CalculatingHeight] -> Determining dashboard dimensions
                    full_height = await page.evaluate("""() => {
                        const body = document.body;
                        const html = document.documentElement;
                        const dashboardContent = document.querySelector('.dashboard-content');
                        return Math.max(
                            body.scrollHeight, body.offsetHeight,
                            html.clientHeight, html.scrollHeight, html.offsetHeight,
                            dashboardContent ? dashboardContent.scrollHeight + 100 : 0
                        );
                    }""")
                    logger.info(f"[DEBUG] Calculated full height: {full_height}")
                    # DIAGNOSTIC: Count chart elements before resize
                    chart_count_before = await page.evaluate("""() => {
                        return {
                            chartContainers: document.querySelectorAll('.chart-container, .slice_container').length,
                            canvasElements: document.querySelectorAll('canvas').length,
                            svgElements: document.querySelectorAll('.chart-container svg, .slice_container svg').length,
                            visibleCharts: document.querySelectorAll('.chart-container:visible, .slice_container:visible').length
                        };
                    }""")
                    logger.info(f"[DIAGNOSTIC] Chart elements BEFORE viewport resize: {chart_count_before}")
                    # DIAGNOSTIC: Capture pre-resize screenshot for comparison
                    pre_resize_path = output_path.replace(".png", "_preresize.png")
                    try:
                        await page.screenshot(path=pre_resize_path, full_page=False, timeout=10000)
                        import os
                        pre_resize_size = os.path.getsize(pre_resize_path) if os.path.exists(pre_resize_path) else 0
                        logger.info(f"[DIAGNOSTIC] Pre-resize screenshot saved: {pre_resize_path} ({pre_resize_size} bytes)")
                    except Exception as pre_e:
                        logger.warning(f"[DIAGNOSTIC] Failed to capture pre-resize screenshot: {pre_e}")
                    logger.info(f"[DIAGNOSTIC] Resizing viewport from current to 1920x{int(full_height)}")
                    await page.set_viewport_size({"width": 1920, "height": int(full_height)})
                    # DIAGNOSTIC: Increased wait time and log timing
                    logger.info("[DIAGNOSTIC] Waiting 10 seconds after viewport resize for re-render...")
                    await asyncio.sleep(10)
                    logger.info("[DIAGNOSTIC] Wait completed")
                    # DIAGNOSTIC: Count chart elements after resize and wait
                    chart_count_after = await page.evaluate("""() => {
                        return {
                            chartContainers: document.querySelectorAll('.chart-container, .slice_container').length,
                            canvasElements: document.querySelectorAll('canvas').length,
                            svgElements: document.querySelectorAll('.chart-container svg, .slice_container svg').length,
                            visibleCharts: document.querySelectorAll('.chart-container:visible, .slice_container:visible').length
                        };
                    }""")
                    logger.info(f"[DIAGNOSTIC] Chart elements AFTER viewport resize + wait: {chart_count_after}")
                    # DIAGNOSTIC: Check if any charts have error states
                    chart_errors = await page.evaluate("""() => {
                        const errors = [];
                        document.querySelectorAll('.chart-container, .slice_container').forEach((chart, i) => {
                            const errorEl = chart.querySelector('.error, .alert-danger, .ant-alert-error');
                            if (errorEl) {
                                errors.push({index: i, text: errorEl.innerText.substring(0, 100)});
                            }
                        });
                        return errors;
                    }""")
                    if chart_errors:
                        logger.warning(f"[DIAGNOSTIC] Charts with error states detected: {chart_errors}")
                    else:
                        logger.info("[DIAGNOSTIC] No chart error states detected")
                    # 3. Take screenshot using CDP to bypass Playwright's font loading wait
                    # @UX_STATE: [Capturing] -> Executing CDP screenshot
                    logger.info("[DEBUG] Attempting full-page screenshot via CDP...")
                    cdp = await page.context.new_cdp_session(page)
                    screenshot_data = await cdp.send("Page.captureScreenshot", {
                        "format": "png",
                        "fromSurface": True,
                        "captureBeyondViewport": True
                    })
                    image_data = base64.b64decode(screenshot_data["data"])
                    with open(output_path, 'wb') as f:
                        f.write(image_data)
                    # DIAGNOSTIC: Verify screenshot file
                    import os
                    final_size = os.path.getsize(output_path) if os.path.exists(output_path) else 0
                    logger.info(f"[DIAGNOSTIC] Final screenshot saved: {output_path}")
                    logger.info(f"[DIAGNOSTIC] Final screenshot size: {final_size} bytes ({final_size / 1024:.2f} KB)")
                    # DIAGNOSTIC: Get image dimensions
                    try:
                        with Image.open(output_path) as final_img:
                            logger.info(f"[DIAGNOSTIC] Final screenshot dimensions: {final_img.width}x{final_img.height}")
                    except Exception as img_err:
                        logger.warning(f"[DIAGNOSTIC] Could not read final image dimensions: {img_err}")
                    logger.info(f"Full-page screenshot saved to {output_path} (via CDP)")
                except Exception as e:
                    # Fallback chain: Playwright full-page shot, then a plain
                    # viewport shot if that also fails.
                    logger.error(f"[DEBUG] Full-page/Tab capture failed: {e}")
                    try:
                        await page.screenshot(path=output_path, full_page=True, timeout=10000)
                    except Exception as e2:
                        logger.error(f"[DEBUG] Fallback screenshot also failed: {e2}")
                        await page.screenshot(path=output_path, timeout=5000)
                await browser.close()
                return True
    # [/DEF:ScreenshotService.capture_dashboard:Function]
# [/DEF:ScreenshotService:Class]


# [DEF:LLMClient:Class]
# @PURPOSE: Wrapper for LLM provider APIs.
class LLMClient:

    # [DEF:LLMClient.__init__:Function]
    # @PURPOSE: Initializes the LLMClient with provider settings.
    # @PRE: api_key, base_url, and default_model are non-empty strings.
    def __init__(self, provider_type: LLMProviderType, api_key: str, base_url: str, default_model: str):
        self.provider_type = provider_type
        self.api_key = api_key
        self.base_url = base_url
        self.default_model = default_model
        # DEBUG: Log initialization parameters (without exposing full API key)
        # NOTE: only the first 8 characters of the key are ever logged.
        logger.info("[LLMClient.__init__] Initializing LLM client:")
        logger.info(f"[LLMClient.__init__] Provider Type: {provider_type}")
        logger.info(f"[LLMClient.__init__] Base URL: {base_url}")
        logger.info(f"[LLMClient.__init__] Default Model: {default_model}")
        logger.info(f"[LLMClient.__init__] API Key (first 8 chars): {api_key[:8] if api_key and len(api_key) > 8 else 'EMPTY_OR_NONE'}...")
        logger.info(f"[LLMClient.__init__] API Key Length: {len(api_key) if api_key else 0}")
        self.client = AsyncOpenAI(api_key=api_key, base_url=base_url)
    # [/DEF:LLMClient.__init__:Function]

    # [DEF:LLMClient.get_json_completion:Function]
    # @PURPOSE: Helper to handle LLM calls with JSON mode and fallback parsing.
    # @PRE: messages is a list of valid message dictionaries.
    # @POST: Returns a parsed JSON dictionary.
    # @SIDE_EFFECT: Calls external LLM API.
def _should_retry(exception: Exception) -> bool: """Custom retry predicate that excludes authentication errors.""" # Don't retry on authentication errors if isinstance(exception, OpenAIAuthenticationError): return False # Retry on rate limit errors and other exceptions return isinstance(exception, (RateLimitError, Exception)) @retry( stop=stop_after_attempt(5), wait=wait_exponential(multiplier=2, min=5, max=60), retry=retry_if_exception(_should_retry), reraise=True ) async def get_json_completion(self, messages: List[Dict[str, Any]]) -> Dict[str, Any]: with belief_scope("get_json_completion"): response = None try: try: logger.info(f"[get_json_completion] Attempting LLM call with JSON mode for model: {self.default_model}") logger.info(f"[get_json_completion] Base URL being used: {self.base_url}") logger.info(f"[get_json_completion] Number of messages: {len(messages)}") logger.info(f"[get_json_completion] API Key present: {bool(self.api_key and len(self.api_key) > 0)}") response = await self.client.chat.completions.create( model=self.default_model, messages=messages, response_format={"type": "json_object"} ) except Exception as e: if "JSON mode is not enabled" in str(e) or "400" in str(e): logger.warning(f"[get_json_completion] JSON mode failed or not supported: {str(e)}. 
Falling back to plain text response.") response = await self.client.chat.completions.create( model=self.default_model, messages=messages ) else: raise e logger.debug(f"[get_json_completion] LLM Response: {response}") except OpenAIAuthenticationError as e: logger.error(f"[get_json_completion] Authentication error: {str(e)}") # Do not retry on auth errors - re-raise to stop retry raise except RateLimitError as e: logger.warning(f"[get_json_completion] Rate limit hit: {str(e)}") # Extract retry_delay from error metadata if available retry_delay = 5.0 # Default fallback try: # Based on logs, the raw response is in e.body or e.response.json() # The logs show 'metadata': {'raw': '...'} which suggests a proxy or specific client wrapper # Let's try to find the 'retryDelay' in the error message or response import re # Try to find "retryDelay": "XXs" in the string representation of the error error_str = str(e) match = re.search(r'"retryDelay":\s*"(\d+)s"', error_str) if match: retry_delay = float(match.group(1)) else: # Try to parse from response if it's a standard OpenAI-like error with body if hasattr(e, 'body') and isinstance(e.body, dict): # Some providers put it in details details = e.body.get('error', {}).get('details', []) for detail in details: if detail.get('@type') == 'type.googleapis.com/google.rpc.RetryInfo': delay_str = detail.get('retryDelay', '5s') retry_delay = float(delay_str.rstrip('s')) break except Exception as parse_e: logger.debug(f"[get_json_completion] Failed to parse retry delay: {parse_e}") # Add a small safety margin (0.5s) as requested wait_time = retry_delay + 0.5 logger.info(f"[get_json_completion] Waiting for {wait_time}s before retry...") await asyncio.sleep(wait_time) raise except Exception as e: logger.error(f"[get_json_completion] LLM call failed: {str(e)}") raise if not response or not hasattr(response, 'choices') or not response.choices: raise RuntimeError(f"Invalid LLM response: {response}") content = response.choices[0].message.content 
logger.debug(f"[get_json_completion] Raw content to parse: {content}") try: return json.loads(content) except json.JSONDecodeError: logger.warning("[get_json_completion] Failed to parse JSON directly, attempting to extract from code blocks") if "```json" in content: json_str = content.split("```json")[1].split("```")[0].strip() return json.loads(json_str) elif "```" in content: json_str = content.split("```")[1].split("```")[0].strip() return json.loads(json_str) else: raise # [/DEF:LLMClient.get_json_completion:Function] # [DEF:LLMClient.analyze_dashboard:Function] # @PURPOSE: Sends dashboard data (screenshot + logs) to LLM for health analysis. # @PRE: screenshot_path exists, logs is a list of strings. # @POST: Returns a structured analysis dictionary (status, summary, issues). # @SIDE_EFFECT: Reads screenshot file and calls external LLM API. async def analyze_dashboard(self, screenshot_path: str, logs: List[str]) -> Dict[str, Any]: with belief_scope("analyze_dashboard"): # Optimize image to reduce token count (US1 / T023) # Gemini/Gemma models have limits on input tokens, and large images contribute significantly. 
try: with Image.open(screenshot_path) as img: # Convert to RGB if necessary if img.mode in ("RGBA", "P"): img = img.convert("RGB") # Resize if too large (max 1024px width while maintaining aspect ratio) # We reduce width further to 1024px to stay within token limits for long dashboards max_width = 1024 if img.width > max_width or img.height > 2048: # Calculate scaling factor to fit within 1024x2048 scale = min(max_width / img.width, 2048 / img.height) if scale < 1.0: new_width = int(img.width * scale) new_height = int(img.height * scale) img = img.resize((new_width, new_height), Image.Resampling.LANCZOS) logger.info(f"[analyze_dashboard] Resized image from {img.width}x{img.height} to {new_width}x{new_height}") # Compress and convert to base64 buffer = io.BytesIO() # Lower quality to 60% to further reduce payload size img.save(buffer, format="JPEG", quality=60, optimize=True) base_64_image = base64.b64encode(buffer.getvalue()).decode('utf-8') logger.info(f"[analyze_dashboard] Optimized image size: {len(buffer.getvalue()) / 1024:.2f} KB") except Exception as img_e: logger.warning(f"[analyze_dashboard] Image optimization failed: {img_e}. Using raw image.") with open(screenshot_path, "rb") as image_file: base_64_image = base64.b64encode(image_file.read()).decode('utf-8') log_text = "\n".join(logs) prompt = f""" Analyze the attached dashboard screenshot and the following execution logs for health and visual issues. Logs: {log_text} Provide the analysis in JSON format with the following structure: {{ "status": "PASS" | "WARN" | "FAIL", "summary": "Short summary of findings", "issues": [ {{ "severity": "WARN" | "FAIL", "message": "Description of the issue", "location": "Optional location info (e.g. 
chart name)" }} ] }} """ messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, { "type": "image_url", "image_url": { "url": f"data:image/jpeg;base64,{base_64_image}" } } ] } ] try: return await self.get_json_completion(messages) except Exception as e: logger.error(f"[analyze_dashboard] Failed to get analysis: {str(e)}") return { "status": "FAIL", "summary": f"Failed to get response from LLM: {str(e)}", "issues": [{"severity": "FAIL", "message": "LLM provider returned empty or invalid response"}] } # [/DEF:LLMClient.analyze_dashboard:Function] # [/DEF:LLMClient:Class] # [/DEF:backend/src/plugins/llm_analysis/service.py:Module]