Files
ss-tools/backend/src/core/migration/archive_parser.py
2026-02-27 20:48:18 +03:00

140 lines
6.1 KiB
Python

# [DEF:backend.src.core.migration.archive_parser:Module]
# @TIER: STANDARD
# @SEMANTICS: migration, zip, parser, yaml, metadata
# @PURPOSE: Parse Superset export ZIP archives into normalized object catalogs for diffing.
# @LAYER: Core
# @RELATION: DEPENDS_ON -> backend.src.core.logger
# @INVARIANT: Parsing is read-only and never mutates archive files.
import json
import tempfile
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional
import yaml
from ..logger import logger, belief_scope
# [DEF:MigrationArchiveParser:Class]
# @PURPOSE: Extract normalized dashboards/charts/datasets metadata from ZIP archives.
class MigrationArchiveParser:
    """Build per-type catalogs of normalized objects from a Superset export ZIP.

    The archive is unpacked into a temporary directory, every YAML manifest
    found under a ``dashboards``, ``charts`` or ``datasets`` directory is
    parsed, and each mapping that carries a ``uuid`` is reduced to a small
    descriptor holding a JSON ``signature`` string suitable for comparison.
    """

    # [DEF:extract_objects_from_zip:Function]
    # @PURPOSE: Extract object catalogs from Superset archive.
    # @PRE: zip_path points to a valid readable ZIP.
    # @POST: Returns object lists grouped by resource type.
    # @RETURN: Dict[str, List[Dict[str, Any]]]
    def extract_objects_from_zip(self, zip_path: str) -> Dict[str, List[Dict[str, Any]]]:
        """Unpack ``zip_path`` and collect normalized objects per resource type.

        Args:
            zip_path: Filesystem path of the export archive.

        Returns:
            A dict with exactly the keys ``"dashboards"``, ``"charts"`` and
            ``"datasets"``, each mapping to a list of normalized descriptors.

        Raises:
            Whatever ``zipfile.ZipFile`` / ``extractall`` raise for a missing
            or malformed archive; nothing is caught at this level.
        """
        with belief_scope("MigrationArchiveParser.extract_objects_from_zip"):
            # The temporary directory is removed on exit, so all manifests
            # must be read while this context is still open.
            with tempfile.TemporaryDirectory() as temp_dir_str:
                temp_dir = Path(temp_dir_str)
                with zipfile.ZipFile(zip_path, "r") as zip_file:
                    # NOTE(review): the whole archive is unpacked with no size
                    # or member-count limit enforced here - confirm callers
                    # validate untrusted uploads before reaching this point.
                    zip_file.extractall(temp_dir)
                return {
                    object_type: self._collect_yaml_objects(temp_dir, object_type)
                    for object_type in ("dashboards", "charts", "datasets")
                }
    # [/DEF:extract_objects_from_zip:Function]

    # [DEF:_collect_yaml_objects:Function]
    # @PURPOSE: Read and normalize YAML manifests for one object type.
    # @PRE: object_type is one of dashboards/charts/datasets.
    # @POST: Returns only valid normalized objects.
    def _collect_yaml_objects(self, root_dir: Path, object_type: str) -> List[Dict[str, Any]]:
        """Parse every ``*.yaml`` file located under an ``object_type`` directory.

        Args:
            root_dir: Directory the archive was extracted into.
            object_type: Directory name to look for at any depth.

        Returns:
            Normalized descriptors in sorted file-path order. Files that fail
            to open, decode, parse or normalize are logged and skipped.
        """
        with belief_scope("MigrationArchiveParser._collect_yaml_objects"):
            # Both patterns are kept so direct children and nested files are
            # covered; the set removes paths matched by both. Sorting makes the
            # result order reproducible (set iteration order is not).
            candidates = sorted(
                {
                    *root_dir.glob(f"**/{object_type}/**/*.yaml"),
                    *root_dir.glob(f"**/{object_type}/*.yaml"),
                }
            )
            objects: List[Dict[str, Any]] = []
            for file_path in candidates:
                try:
                    # Decode explicitly as UTF-8 so non-ASCII titles do not
                    # depend on the host locale's default encoding.
                    with open(file_path, "r", encoding="utf-8") as file_obj:
                        payload = yaml.safe_load(file_obj) or {}
                    normalized = self._normalize_object_payload(payload, object_type)
                    if normalized:
                        objects.append(normalized)
                except Exception as exc:
                    # Deliberate best-effort: one bad manifest must not abort
                    # the whole catalog, so it is reported and skipped.
                    logger.reflect(
                        "[MigrationArchiveParser._collect_yaml_objects][REFLECT] skip_invalid_yaml path=%s error=%s",
                        file_path,
                        exc,
                    )
            return objects
    # [/DEF:_collect_yaml_objects:Function]

    @staticmethod
    def _dump_signature(fields: Dict[str, Any]) -> str:
        """Serialize signature fields to a key-sorted JSON string."""
        # default=str stringifies values json cannot encode instead of raising.
        return json.dumps(fields, sort_keys=True, default=str)

    # [DEF:_normalize_object_payload:Function]
    # @PURPOSE: Convert raw YAML payload to stable diff signature shape.
    # @PRE: payload is parsed YAML mapping.
    # @POST: Returns normalized descriptor with `uuid`, `title`, and `signature`.
    def _normalize_object_payload(self, payload: Dict[str, Any], object_type: str) -> Optional[Dict[str, Any]]:
        """Reduce one parsed manifest to a descriptor for its object type.

        Args:
            payload: Parsed YAML document; anything that is not a dict is rejected.
            object_type: ``"dashboards"``, ``"charts"`` or ``"datasets"``.

        Returns:
            A dict with ``uuid``, ``title``, ``signature`` and one type-specific
            extra key (``owners``, ``dataset_uuid`` or ``database_uuid``), or
            ``None`` when the payload is not a dict, has no truthy ``uuid``,
            or ``object_type`` is not recognised.
        """
        with belief_scope("MigrationArchiveParser._normalize_object_payload"):
            if not isinstance(payload, dict):
                return None
            uuid = payload.get("uuid")
            if not uuid:
                return None

            if object_type == "dashboards":
                title = payload.get("dashboard_title") or payload.get("title")
                signature = {
                    "title": title,
                    "slug": payload.get("slug"),
                    "position_json": payload.get("position_json"),
                    "json_metadata": payload.get("json_metadata"),
                    "description": payload.get("description"),
                    "owners": payload.get("owners"),
                }
                return {
                    "uuid": str(uuid),
                    "title": title or f"Dashboard {uuid}",
                    "signature": self._dump_signature(signature),
                    "owners": payload.get("owners") or [],
                }

            if object_type == "charts":
                title = payload.get("slice_name") or payload.get("name")
                signature = {
                    "title": title,
                    "viz_type": payload.get("viz_type"),
                    "params": payload.get("params"),
                    "query_context": payload.get("query_context"),
                    "datasource_uuid": payload.get("datasource_uuid"),
                    "dataset_uuid": payload.get("dataset_uuid"),
                }
                return {
                    "uuid": str(uuid),
                    "title": title or f"Chart {uuid}",
                    "signature": self._dump_signature(signature),
                    # Either key may carry the reference; datasource_uuid wins.
                    "dataset_uuid": payload.get("datasource_uuid") or payload.get("dataset_uuid"),
                }

            if object_type == "datasets":
                title = payload.get("table_name") or payload.get("name")
                signature = {
                    "title": title,
                    "schema": payload.get("schema"),
                    "database_uuid": payload.get("database_uuid"),
                    "sql": payload.get("sql"),
                    "columns": payload.get("columns"),
                    "metrics": payload.get("metrics"),
                }
                return {
                    "uuid": str(uuid),
                    "title": title or f"Dataset {uuid}",
                    "signature": self._dump_signature(signature),
                    "database_uuid": payload.get("database_uuid"),
                }

            return None
    # [/DEF:_normalize_object_payload:Function]
# [/DEF:MigrationArchiveParser:Class]
# [/DEF:backend.src.core.migration.archive_parser:Module]