MigrationEngine contracts. It avoids tautologies by cleanly substituting IdMappingService without mocking the engine itself. Cross-filter parsing asserts against hard-coded, predefined validation dictionaries (no logic mirroring). It successfully addresses @PRE negative cases (e.g. invalid zip paths, missing YAMLs) and rigorously validates @POST file transformations (e.g. in-place UUID substitutions and archive reconstruction)." }
235 lines
11 KiB
Python
235 lines
11 KiB
Python
# [DEF:backend.src.core.mapping_service:Module]
|
|
#
|
|
# @TIER: CRITICAL
|
|
# @SEMANTICS: mapping, ids, synchronization, environments, cross-filters
|
|
# @PURPOSE: Service for tracking and synchronizing Superset Resource IDs (UUID <-> Integer ID)
|
|
# @LAYER: Core
|
|
# @RELATION: DEPENDS_ON -> backend.src.models.mapping (ResourceMapping, ResourceType)
|
|
# @RELATION: DEPENDS_ON -> backend.src.core.logger
|
|
# @TEST_DATA: mock_superset_resources -> {'chart': [{'id': 42, 'uuid': '1234', 'slice_name': 'test'}], 'dataset': [{'id': 99, 'uuid': '5678', 'table_name': 'data'}]}
|
|
#
|
|
# @INVARIANT: sync_environment must handle remote API failures gracefully.
|
|
|
|
# [SECTION: IMPORTS]
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
from sqlalchemy.sql import func

from src.core.logger import logger, belief_scope
from src.models.mapping import ResourceMapping, ResourceType
# [/SECTION]
|
|
|
|
# [DEF:IdMappingService:Class]
# @TIER: CRITICAL
# @PURPOSE: Service handling the cataloging and retrieval of remote Superset Integer IDs.
class IdMappingService:

    # [DEF:__init__:Function]
    # @PURPOSE: Initializes the mapping service.
    # @PARAM: db_session (Session) - SQLAlchemy session used for all mapping reads/writes.
    def __init__(self, db_session: Session):
        self.db = db_session
        # Scheduler is created eagerly but only started by start_scheduler().
        self.scheduler = BackgroundScheduler()
        # Handle of the currently registered sync job; None until scheduled.
        self._sync_job = None
    # [/DEF:__init__:Function]

    # [DEF:start_scheduler:Function]
    # @PURPOSE: Starts the background scheduler with a given cron string.
    # @PARAM: cron_string (str) - Cron expression for the sync interval.
    # @PARAM: environments (List[str]) - List of environment IDs to sync.
    # @PARAM: superset_client_factory - Function to get a client for an environment.
    # @POST: Exactly one job with id 'id_mapping_sync_job' is registered (any previous one removed).
    def start_scheduler(self, cron_string: str, environments: List[str], superset_client_factory) -> None:
        with belief_scope("IdMappingService.start_scheduler"):
            if self._sync_job:
                # Drop the previous job so a reconfigure does not double-schedule.
                self.scheduler.remove_job(self._sync_job.id)
                logger.info("[IdMappingService.start_scheduler][Reflect] Removed existing sync job.")

            def sync_all():
                # Sync every configured environment; a falsy client (factory
                # could not build one) simply skips that environment.
                for env_id in environments:
                    client = superset_client_factory(env_id)
                    if client:
                        self.sync_environment(env_id, client)

            self._sync_job = self.scheduler.add_job(
                sync_all,
                CronTrigger.from_crontab(cron_string),
                id='id_mapping_sync_job',
                replace_existing=True
            )

            if not self.scheduler.running:
                self.scheduler.start()
                logger.info(f"[IdMappingService.start_scheduler][Coherence:OK] Started background scheduler with cron: {cron_string}")
            else:
                logger.info(f"[IdMappingService.start_scheduler][Coherence:OK] Updated background scheduler with cron: {cron_string}")
    # [/DEF:start_scheduler:Function]

    # [DEF:sync_environment:Function]
    # @PURPOSE: Fully synchronizes mapping for a specific environment.
    # @PARAM: environment_id (str) - Target environment ID.
    # @PARAM: superset_client - Instance capable of hitting the Superset API.
    # @PARAM: incremental (bool) - If True, only fetch items changed since the last sync.
    # @PRE: environment_id exists in the database.
    # @POST: ResourceMapping records for the environment are created or updated.
    def sync_environment(self, environment_id: str, superset_client, incremental: bool = False) -> None:
        """
        Polls the Superset APIs for the target environment and updates the local mapping table.

        If incremental=True, only fetches items changed since the max last_synced_at date.
        A failure on a single resource type is logged and skipped; a failure outside the
        per-type loop (e.g. on commit) rolls the session back and re-raises.
        """
        with belief_scope("IdMappingService.sync_environment"):
            logger.info(f"[IdMappingService.sync_environment][Action] Starting sync for environment {environment_id} (incremental={incremental})")

            # Implementation Note: In a real scenario, superset_client needs to be an instance
            # capable of auth & iteration over /api/v1/chart/, /api/v1/dataset/, /api/v1/dashboard/
            # (enum, API endpoint, attribute used as the human-readable name)
            types_to_poll = [
                (ResourceType.CHART, "chart", "slice_name"),
                (ResourceType.DATASET, "dataset", "table_name"),
                (ResourceType.DASHBOARD, "dashboard", "slug")  # Note: dashboard slug or dashboard_title
            ]

            total_synced = 0
            total_deleted = 0
            try:
                for res_enum, endpoint, name_field in types_to_poll:
                    try:
                        synced, deleted = self._sync_resource_type(
                            environment_id, superset_client, res_enum, endpoint, name_field, incremental
                        )
                        total_synced += synced
                        total_deleted += deleted
                    except Exception as loop_e:
                        # Continue to next resource type instead of blowing up the whole sync.
                        logger.error(f"[IdMappingService.sync_environment][Reason] Error polling {endpoint}: {loop_e}")

                self.db.commit()
                logger.info(f"[IdMappingService.sync_environment][Coherence:OK] Successfully synced {total_synced} items and deleted {total_deleted} stale items.")

            except Exception as e:
                self.db.rollback()
                logger.error(f"[IdMappingService.sync_environment][Coherence:Failed] Critical sync failure: {e}")
                raise
    # [/DEF:sync_environment:Function]

    # [DEF:_sync_resource_type:Function]
    # @PURPOSE: Polls one resource endpoint and upserts/deletes its mappings (no commit).
    # @PARAM: environment_id (str) - Target environment ID.
    # @PARAM: superset_client - Client exposing get_all_resources(endpoint, since_dttm=...).
    # @PARAM: res_enum (ResourceType) - Resource type being synced.
    # @PARAM: endpoint (str) - Superset API endpoint name (e.g. "chart").
    # @PARAM: name_field (str) - Response key holding the resource's display name.
    # @PARAM: incremental (bool) - Restrict the fetch to recently changed items.
    # @RETURN: (synced_count, deleted_count) tuple of ints.
    def _sync_resource_type(self, environment_id: str, superset_client, res_enum,
                            endpoint: str, name_field: str, incremental: bool):
        logger.debug(f"[IdMappingService.sync_environment][Explore] Polling {endpoint} endpoint")

        since_dttm = None
        if incremental:
            max_date = self.db.query(func.max(ResourceMapping.last_synced_at)).filter(
                ResourceMapping.environment_id == environment_id,
                ResourceMapping.resource_type == res_enum
            ).scalar()
            if max_date:
                # Subtract a small overlap window so items updated right at the
                # previous sync boundary are not missed.
                since_dttm = max_date - timedelta(minutes=5)
                logger.debug(f"[IdMappingService.sync_environment] Incremental sync since {since_dttm}")

        # We assume superset_client provides a generic method to fetch all pages,
        # yielding dicts shaped like {"id": 1, "uuid": "...", name_field: "..."}.
        resources = superset_client.get_all_resources(endpoint, since_dttm=since_dttm)

        synced = 0
        synced_uuids = set()  # UUIDs seen in this cycle; drives stale deletion below
        for res in resources:
            res_uuid = res.get("uuid")
            raw_id = res.get("id")
            if not res_uuid or raw_id is None:
                # Cannot build a mapping without both identifiers.
                continue
            synced_uuids.add(res_uuid)
            # Remote integer IDs are stored as strings.
            self._upsert_mapping(environment_id, res_enum, res_uuid, str(raw_id), res.get(name_field))
            synced += 1

        # Delete stale mappings only on full syncs: incremental responses do not
        # enumerate every UUID, so absence there does not imply remote deletion.
        deleted = 0
        if not incremental:
            deleted = self._delete_stale_mappings(environment_id, res_enum, endpoint, synced_uuids)
        return synced, deleted
    # [/DEF:_sync_resource_type:Function]

    # [DEF:_upsert_mapping:Function]
    # @PURPOSE: Creates or updates a single ResourceMapping row (no commit).
    # @PARAM: res_id (str) - Remote integer ID, already stringified.
    # @PARAM: res_name - Display name from the API response (may be None).
    def _upsert_mapping(self, environment_id: str, res_enum, res_uuid: str, res_id: str, res_name) -> None:
        mapping = self.db.query(ResourceMapping).filter_by(
            environment_id=environment_id,
            resource_type=res_enum,
            uuid=res_uuid
        ).first()

        if mapping:
            mapping.remote_integer_id = res_id
            mapping.resource_name = res_name
            mapping.last_synced_at = datetime.now(timezone.utc)
        else:
            self.db.add(ResourceMapping(
                environment_id=environment_id,
                resource_type=res_enum,
                uuid=res_uuid,
                remote_integer_id=res_id,
                resource_name=res_name,
                last_synced_at=datetime.now(timezone.utc)
            ))
    # [/DEF:_upsert_mapping:Function]

    # [DEF:_delete_stale_mappings:Function]
    # @PURPOSE: Removes mappings for env+type whose UUID was NOT returned by the API
    #           (i.e. the resource was deleted remotely). No commit.
    # @RETURN: Number of rows deleted (int).
    def _delete_stale_mappings(self, environment_id: str, res_enum, endpoint: str, synced_uuids) -> int:
        stale_query = self.db.query(ResourceMapping).filter(
            ResourceMapping.environment_id == environment_id,
            ResourceMapping.resource_type == res_enum,
        )
        if synced_uuids:
            stale_query = stale_query.filter(
                ResourceMapping.uuid.notin_(synced_uuids)
            )
        deleted = stale_query.delete(synchronize_session="fetch")
        if deleted:
            logger.info(f"[IdMappingService.sync_environment][Action] Removed {deleted} stale {endpoint} mapping(s) for {environment_id}")
        return deleted
    # [/DEF:_delete_stale_mappings:Function]

    # [DEF:get_remote_id:Function]
    # @PURPOSE: Retrieves the remote integer ID for a given universal UUID.
    # @PARAM: environment_id (str)
    # @PARAM: resource_type (ResourceType)
    # @PARAM: uuid (str)
    # @RETURN: Optional[int] - None when no mapping exists or the stored ID is not numeric.
    def get_remote_id(self, environment_id: str, resource_type: ResourceType, uuid: str) -> Optional[int]:
        mapping = self.db.query(ResourceMapping).filter_by(
            environment_id=environment_id,
            resource_type=resource_type,
            uuid=uuid
        ).first()

        if mapping:
            try:
                return int(mapping.remote_integer_id)
            except (TypeError, ValueError):
                # Stored value is missing (None) or non-numeric; treat as unmapped.
                return None
        return None
    # [/DEF:get_remote_id:Function]

    # [DEF:get_remote_ids_batch:Function]
    # @PURPOSE: Retrieves remote integer IDs for a list of universal UUIDs efficiently.
    # @PARAM: environment_id (str)
    # @PARAM: resource_type (ResourceType)
    # @PARAM: uuids (List[str])
    # @RETURN: Dict[str, int] - Mapping of UUID -> Integer ID (unparseable IDs omitted).
    def get_remote_ids_batch(self, environment_id: str, resource_type: ResourceType, uuids: List[str]) -> Dict[str, int]:
        if not uuids:
            return {}

        # Single IN-query instead of one lookup per UUID.
        mappings = self.db.query(ResourceMapping).filter(
            ResourceMapping.environment_id == environment_id,
            ResourceMapping.resource_type == resource_type,
            ResourceMapping.uuid.in_(uuids)
        ).all()

        result = {}
        for m in mappings:
            try:
                result[m.uuid] = int(m.remote_integer_id)
            except (TypeError, ValueError):
                # Skip rows whose stored ID cannot be parsed as an integer.
                pass

        return result
    # [/DEF:get_remote_ids_batch:Function]

# [/DEF:IdMappingService:Class]
|
|
# [/DEF:backend.src.core.mapping_service:Module]
|