# [DEF:backend.src.core.migration.archive_parser:Module]
# @TIER: STANDARD
# @SEMANTICS: migration, zip, parser, yaml, metadata
# @PURPOSE: Parse Superset export ZIP archives into normalized object catalogs for diffing.
# @LAYER: Core
# @RELATION: DEPENDS_ON -> backend.src.core.logger
# @INVARIANT: Parsing is read-only and never mutates archive files.
import json
import tempfile
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

from ..logger import logger, belief_scope


# [DEF:MigrationArchiveParser:Class]
# @PURPOSE: Extract normalized dashboards/charts/datasets metadata from ZIP archives.
class MigrationArchiveParser:
    """Parse Superset export ZIP archives into normalized object catalogs.

    Parsing is read-only: the archive is unpacked into a throwaway temporary
    directory and the original ZIP file is never modified.
    """

    # [DEF:extract_objects_from_zip:Function]
    # @PURPOSE: Extract object catalogs from Superset archive.
    # @PRE: zip_path points to a valid readable ZIP.
    # @POST: Returns object lists grouped by resource type.
    # @RETURN: Dict[str, List[Dict[str, Any]]]
    def extract_objects_from_zip(self, zip_path: str) -> Dict[str, List[Dict[str, Any]]]:
        """Extract dashboards/charts/datasets catalogs from a Superset export ZIP.

        Args:
            zip_path: Filesystem path to a readable Superset export archive.

        Returns:
            Mapping with keys ``"dashboards"``, ``"charts"`` and ``"datasets"``,
            each a list of normalized object descriptors (see
            :meth:`_normalize_object_payload`).

        Raises:
            zipfile.BadZipFile: If ``zip_path`` is not a valid ZIP archive.
            OSError: If the archive cannot be read.
        """
        with belief_scope("MigrationArchiveParser.extract_objects_from_zip"):
            result: Dict[str, List[Dict[str, Any]]] = {
                "dashboards": [],
                "charts": [],
                "datasets": [],
            }
            # TemporaryDirectory guarantees cleanup even if parsing raises.
            with tempfile.TemporaryDirectory() as temp_dir_str:
                temp_dir = Path(temp_dir_str)
                with zipfile.ZipFile(zip_path, "r") as zip_file:
                    zip_file.extractall(temp_dir)
                for object_type in result:
                    result[object_type] = self._collect_yaml_objects(temp_dir, object_type)
            return result
    # [/DEF:extract_objects_from_zip:Function]

    # [DEF:_collect_yaml_objects:Function]
    # @PURPOSE: Read and normalize YAML manifests for one object type.
    # @PRE: object_type is one of dashboards/charts/datasets.
    # @POST: Returns only valid normalized objects.
    def _collect_yaml_objects(self, root_dir: Path, object_type: str) -> List[Dict[str, Any]]:
        """Load and normalize every ``<object_type>/**/*.yaml`` manifest under ``root_dir``.

        Unreadable or invalid manifests are logged and skipped so a single bad
        file never aborts the whole parse. Files are processed in sorted-path
        order so the resulting catalog order is deterministic.
        """
        with belief_scope("MigrationArchiveParser._collect_yaml_objects"):
            # The first pattern matches manifests nested below the type folder;
            # the second matches direct children. With pathlib's "**" the first
            # pattern usually covers both, so set() deduplicates any overlap.
            files = list(root_dir.glob(f"**/{object_type}/**/*.yaml")) + list(
                root_dir.glob(f"**/{object_type}/*.yaml")
            )
            objects: List[Dict[str, Any]] = []
            # FIX: sorted() — iterating the bare set produced a nondeterministic
            # catalog order between runs, which is hostile to diffing.
            for file_path in sorted(set(files)):
                try:
                    # FIX: explicit encoding — Superset manifests are UTF-8;
                    # relying on the platform default codec breaks on systems
                    # where it is not UTF-8 (e.g. legacy Windows locales).
                    with open(file_path, "r", encoding="utf-8") as file_obj:
                        # safe_load (not load): archive content is untrusted.
                        payload = yaml.safe_load(file_obj) or {}
                    normalized = self._normalize_object_payload(payload, object_type)
                    if normalized:
                        objects.append(normalized)
                except Exception as exc:
                    # Deliberate best-effort: skip the broken manifest, keep going.
                    logger.reflect(
                        "[MigrationArchiveParser._collect_yaml_objects][REFLECT] skip_invalid_yaml path=%s error=%s",
                        file_path,
                        exc,
                    )
            return objects
    # [/DEF:_collect_yaml_objects:Function]

    # [DEF:_normalize_object_payload:Function]
    # @PURPOSE: Convert raw YAML payload to stable diff signature shape.
    # @PRE: payload is parsed YAML mapping.
    # @POST: Returns normalized descriptor with `uuid`, `title`, and `signature`.
    def _normalize_object_payload(self, payload: Dict[str, Any], object_type: str) -> Optional[Dict[str, Any]]:
        """Convert one parsed YAML payload into a stable diff descriptor.

        Args:
            payload: Parsed YAML mapping for a single exported object.
            object_type: One of ``"dashboards"``, ``"charts"``, ``"datasets"``.

        Returns:
            A dict with ``uuid``, ``title`` and a ``signature`` string (a
            sort-keyed JSON dump of the diff-relevant fields, so equal content
            always yields equal strings), plus type-specific linkage fields —
            or ``None`` when the payload is not a mapping, has no ``uuid``, or
            ``object_type`` is unknown.
        """
        with belief_scope("MigrationArchiveParser._normalize_object_payload"):
            if not isinstance(payload, dict):
                return None
            uuid = payload.get("uuid")
            if not uuid:
                return None

            if object_type == "dashboards":
                title = payload.get("dashboard_title") or payload.get("title")
                signature = {
                    "title": title,
                    "slug": payload.get("slug"),
                    "position_json": payload.get("position_json"),
                    "json_metadata": payload.get("json_metadata"),
                    "description": payload.get("description"),
                    "owners": payload.get("owners"),
                }
                return {
                    "uuid": str(uuid),
                    "title": title or f"Dashboard {uuid}",
                    # sort_keys gives a canonical string; default=str tolerates
                    # non-JSON scalars (dates etc.) without raising.
                    "signature": json.dumps(signature, sort_keys=True, default=str),
                    "owners": payload.get("owners") or [],
                }

            if object_type == "charts":
                title = payload.get("slice_name") or payload.get("name")
                signature = {
                    "title": title,
                    "viz_type": payload.get("viz_type"),
                    "params": payload.get("params"),
                    "query_context": payload.get("query_context"),
                    "datasource_uuid": payload.get("datasource_uuid"),
                    "dataset_uuid": payload.get("dataset_uuid"),
                }
                return {
                    "uuid": str(uuid),
                    "title": title or f"Chart {uuid}",
                    "signature": json.dumps(signature, sort_keys=True, default=str),
                    # Export formats vary: older manifests say datasource_uuid,
                    # newer ones dataset_uuid — accept either.
                    "dataset_uuid": payload.get("datasource_uuid") or payload.get("dataset_uuid"),
                }

            if object_type == "datasets":
                title = payload.get("table_name") or payload.get("name")
                signature = {
                    "title": title,
                    "schema": payload.get("schema"),
                    "database_uuid": payload.get("database_uuid"),
                    "sql": payload.get("sql"),
                    "columns": payload.get("columns"),
                    "metrics": payload.get("metrics"),
                }
                return {
                    "uuid": str(uuid),
                    "title": title or f"Dataset {uuid}",
                    "signature": json.dumps(signature, sort_keys=True, default=str),
                    "database_uuid": payload.get("database_uuid"),
                }

            # Unknown object_type: nothing to normalize.
            return None
    # [/DEF:_normalize_object_payload:Function]
# [/DEF:MigrationArchiveParser:Class]
# [/DEF:backend.src.core.migration.archive_parser:Module]