# [DEF:backend.src.core.migration_engine:Module]
#
# @SEMANTICS: migration, engine, zip, yaml, transformation
# @PURPOSE: Handles the interception and transformation of Superset asset ZIP archives.
# @LAYER: Core
# @RELATION: DEPENDS_ON -> PyYAML
#
# @INVARIANT: ZIP structure must be preserved after transformation.

# [SECTION: IMPORTS]
import zipfile
import yaml
import os
import json
import re
import tempfile
from pathlib import Path
from typing import Dict, Optional, List

from .logger import logger, belief_scope
from src.core.mapping_service import IdMappingService
from src.models.mapping import ResourceType
# [/SECTION]


# [DEF:MigrationEngine:Class]
# @PURPOSE: Engine for transforming Superset export ZIPs.
class MigrationEngine:

    # [DEF:__init__:Function]
    # @PURPOSE: Initializes the migration engine with an optional ID mapping service.
    # @PARAM: mapping_service (Optional[IdMappingService]) - Used for resolving target environment integer IDs.
    def __init__(self, mapping_service: Optional[IdMappingService] = None):
        self.mapping_service = mapping_service
    # [/DEF:__init__:Function]

    # [DEF:transform_zip:Function]
    # @PURPOSE: Extracts the ZIP, replaces database UUIDs in YAMLs, patches cross-filters, and re-packages.
    # @PARAM: zip_path (str) - Path to the source ZIP file.
    # @PARAM: output_path (str) - Path where the transformed ZIP will be saved.
    # @PARAM: db_mapping (Dict[str, str]) - Mapping of source UUID to target UUID.
    # @PARAM: strip_databases (bool) - Whether to remove the databases directory from the archive.
    # @PARAM: target_env_id (Optional[str]) - Used if fix_cross_filters is True to select the environment map.
    # @PARAM: fix_cross_filters (bool) - Whether to patch dashboard json_metadata.
    # @PRE: zip_path must point to a valid Superset export archive.
    # @POST: Transformed archive is saved to output_path.
    # @RETURN: bool - True if successful.
    def transform_zip(
        self,
        zip_path: str,
        output_path: str,
        db_mapping: Dict[str, str],
        strip_databases: bool = True,
        target_env_id: Optional[str] = None,
        fix_cross_filters: bool = False,
    ) -> bool:
        """
        Transform a Superset export ZIP by replacing database UUIDs and
        optionally fixing cross-filters.
        """
        with belief_scope("MigrationEngine.transform_zip"):
            with tempfile.TemporaryDirectory() as temp_dir_str:
                temp_dir = Path(temp_dir_str)
                try:
                    # 1. Extract
                    logger.info(f"[MigrationEngine.transform_zip][Action] Extracting ZIP: {zip_path}")
                    with zipfile.ZipFile(zip_path, 'r') as zf:
                        zf.extractall(temp_dir)

                    # 2. Transform YAMLs (Databases)
                    dataset_files = list(temp_dir.glob("**/datasets/**/*.yaml")) + list(temp_dir.glob("**/datasets/*.yaml"))
                    dataset_files = list(set(dataset_files))
                    logger.info(f"[MigrationEngine.transform_zip][State] Found {len(dataset_files)} dataset files.")
                    for ds_file in dataset_files:
                        logger.info(f"[MigrationEngine.transform_zip][Action] Transforming dataset: {ds_file}")
                        self._transform_yaml(ds_file, db_mapping)

                    # 2.5 Patch Cross-Filters (Dashboards)
                    if fix_cross_filters and self.mapping_service and target_env_id:
                        dash_files = list(temp_dir.glob("**/dashboards/**/*.yaml")) + list(temp_dir.glob("**/dashboards/*.yaml"))
                        dash_files = list(set(dash_files))
                        logger.info(f"[MigrationEngine.transform_zip][State] Found {len(dash_files)} dashboard files for patching.")

                        # Gather all source ID-to-UUID mappings from the archive first.
                        source_id_to_uuid_map = self._extract_chart_uuids_from_archive(temp_dir)

                        for dash_file in dash_files:
                            logger.info(f"[MigrationEngine.transform_zip][Action] Patching dashboard: {dash_file}")
                            self._patch_dashboard_metadata(dash_file, target_env_id, source_id_to_uuid_map)

                    # 3. Re-package
                    logger.info(f"[MigrationEngine.transform_zip][Action] Re-packaging ZIP to: {output_path} (strip_databases={strip_databases})")
                    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
                        for root, dirs, files in os.walk(temp_dir):
                            rel_root = Path(root).relative_to(temp_dir)
                            if strip_databases and "databases" in rel_root.parts:
                                logger.info(f"[MigrationEngine.transform_zip][Action] Skipping file in databases directory: {rel_root}")
                                continue
                            for file in files:
                                file_path = Path(root) / file
                                arcname = file_path.relative_to(temp_dir)
                                zf.write(file_path, arcname)

                    return True
                except Exception as e:
                    logger.error(f"[MigrationEngine.transform_zip][Coherence:Failed] Error transforming ZIP: {e}")
                    return False
    # [/DEF:transform_zip:Function]
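    # For reference, the fragment of a Superset dataset export YAML that
    # _transform_yaml below rewrites looks roughly like this (a sketch; the
    # field values are hypothetical, and only `database_uuid` is touched):
    #
    #   table_name: my_table
    #   uuid: 1c049f53-0000-0000-0000-000000000000
    #   database_uuid: 7a2f3c10-0000-0000-0000-000000000000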
    # [DEF:_transform_yaml:Function]
    # @PURPOSE: Replaces database_uuid in a single YAML file.
    # @PARAM: file_path (Path) - Path to the YAML file.
    # @PARAM: db_mapping (Dict[str, str]) - UUID mapping dictionary.
    # @PRE: file_path must exist and be readable.
    # @POST: File is modified in-place if the source UUID matches the mapping.
    def _transform_yaml(self, file_path: Path, db_mapping: Dict[str, str]):
        with open(file_path, 'r') as f:
            data = yaml.safe_load(f)

        if not data:
            return

        # Superset dataset YAML structure:
        #   database_uuid: ...
        source_uuid = data.get('database_uuid')
        if source_uuid in db_mapping:
            data['database_uuid'] = db_mapping[source_uuid]
            with open(file_path, 'w') as f:
                # sort_keys=False keeps the original key order intact.
                yaml.dump(data, f, sort_keys=False)
    # [/DEF:_transform_yaml:Function]

    # [DEF:_extract_chart_uuids_from_archive:Function]
    # @PURPOSE: Scans the unpacked ZIP to map locally exported integer IDs back to their UUIDs.
    # @PARAM: temp_dir (Path) - Root directory of the unpacked archive.
    # @RETURN: Dict[int, str] - Mapping of source integer ID to UUID.
    def _extract_chart_uuids_from_archive(self, temp_dir: Path) -> Dict[int, str]:
        # Implementation note: this is a placeholder for the logic that extracts
        # actual source IDs. In a full implementation this would involve parsing
        # chart YAMLs or the export's metadata/manifest structure, where source
        # IDs are stored. For simplicity in the US1 MVP, we read them from chart
        # files when present.
        mapping = {}
        chart_files = list(temp_dir.glob("**/charts/**/*.yaml")) + list(temp_dir.glob("**/charts/*.yaml"))
        for cf in set(chart_files):
            try:
                with open(cf, 'r') as f:
                    cdata = yaml.safe_load(f)
                if cdata and 'id' in cdata and 'uuid' in cdata:
                    mapping[cdata['id']] = cdata['uuid']
            except Exception as e:
                logger.warning(f"[MigrationEngine._extract_chart_uuids_from_archive][Coherence:Recoverable] Failed to parse chart file {cf}: {e}")
        return mapping
    # [/DEF:_extract_chart_uuids_from_archive:Function]
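    # For reference, the kind of json_metadata fragment that
    # _patch_dashboard_metadata below rewrites looks roughly like this
    # (a sketch; the key names match the regex in the code, while the
    # surrounding structure and values are assumptions):
    #
    #   "native_filter_configuration": [
    #     {"targets": [{"datasetId": 42}], "scope": {...}}
    #   ]
    #
    # Any integer directly following a "chartId" or "datasetId" key is swapped
    # via the source-to-target map; here 42 would be replaced if it appears in
    # the source map.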
    # [DEF:_patch_dashboard_metadata:Function]
    # @PURPOSE: Replaces source integer IDs with target IDs in a dashboard's json_metadata.
    # @PARAM: file_path (Path) - Path to the dashboard YAML file.
    # @PARAM: target_env_id (str) - Target environment whose ID map is consulted.
    # @PARAM: source_map (Dict[int, str]) - Mapping of source integer ID to UUID.
    def _patch_dashboard_metadata(self, file_path: Path, target_env_id: str, source_map: Dict[int, str]):
        with belief_scope("MigrationEngine._patch_dashboard_metadata"):
            try:
                with open(file_path, 'r') as f:
                    data = yaml.safe_load(f)

                if not data or 'json_metadata' not in data:
                    return

                metadata_str = data['json_metadata']
                if not metadata_str:
                    return

                # Validate that the metadata is parseable JSON before patching.
                json.loads(metadata_str)

                # Fetch target IDs for every UUID we know about:
                uuids_needed = list(source_map.values())
                target_ids = self.mapping_service.get_remote_ids_batch(target_env_id, ResourceType.CHART, uuids_needed)
                if not target_ids:
                    logger.info("[MigrationEngine._patch_dashboard_metadata][Reflect] No remote target IDs found in mapping database.")
                    return

                # Map source int -> target int.
                source_to_target = {}
                missing_targets = []
                for s_id, s_uuid in source_map.items():
                    if s_uuid in target_ids:
                        source_to_target[s_id] = target_ids[s_uuid]
                    else:
                        missing_targets.append(s_id)

                if missing_targets:
                    logger.warning(f"[MigrationEngine._patch_dashboard_metadata][Coherence:Recoverable] Missing target IDs for source IDs: {missing_targets}. Cross-filters for these IDs might break.")

                if not source_to_target:
                    logger.info("[MigrationEngine._patch_dashboard_metadata][Reflect] No source IDs matched remotely. Skipping patch.")
                    return

                # A deep traversal of the metadata dict (e.g. native_filter_configuration)
                # would be safer in general; for the MVP we patch the raw JSON string with
                # a targeted regex instead, which copes with unknown nested shapes. A
                # single-pass substitution with a callback avoids chained replacements
                # (e.g. 4 -> 42 being rewritten again by a later 42 -> ... rule).
                id_pattern = re.compile(r'("(?:chartId|datasetId)"\s*:\s*)(\d+)\b')

                def _swap_id(match):
                    s_id = int(match.group(2))
                    return match.group(1) + str(source_to_target.get(s_id, s_id))

                new_metadata_str = id_pattern.sub(_swap_id, metadata_str)

                # Re-parse to validate the patched JSON, then re-serialize it.
                data['json_metadata'] = json.dumps(json.loads(new_metadata_str))
                with open(file_path, 'w') as f:
                    # sort_keys=False keeps the original key order intact.
                    yaml.dump(data, f, sort_keys=False)
                logger.info("[MigrationEngine._patch_dashboard_metadata][Reason] Re-serialized modified JSON metadata for dashboard.")
            except Exception as e:
                logger.error(f"[MigrationEngine._patch_dashboard_metadata][Coherence:Failed] Metadata patch failed: {e}")
    # [/DEF:_patch_dashboard_metadata:Function]
# [/DEF:MigrationEngine:Class]

# [/DEF:backend.src.core.migration_engine:Module]
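
# Example usage (a minimal sketch; the paths, UUID placeholders, and the
# environment ID are hypothetical):
#
#   engine = MigrationEngine()  # pass an IdMappingService to enable patching
#   ok = engine.transform_zip(
#       zip_path="dashboard_export.zip",
#       output_path="dashboard_export.transformed.zip",
#       db_mapping={"<source-db-uuid>": "<target-db-uuid>"},
#       strip_databases=True,
#   )
#
# With a mapping service attached, cross-filter patching can be requested by
# also passing target_env_id="prod" and fix_cross_filters=True.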