Files
ss-tools/backend/src/core/mapping_service.py
busya 99f19ac305 { "verdict": "APPROVED", "rejection_reason": "NONE", "audit_details": { "target_invoked": true, "pre_conditions_tested": true, "post_conditions_tested": true, "test_data_used": true }, "feedback": "The test suite robustly verifies the
MigrationEngine
 contracts. It avoids Tautologies by cleanly substituting IdMappingService without mocking the engine itself. Cross-filter parsing asserts against hard-coded, predefined validation dictionaries (no Logic Mirroring). It successfully addresses @PRE negative cases (e.g. invalid zip paths, missing YAMLs) and rigorously validates @POST file transformations (e.g. in-place UUID substitutions and archive reconstruction)." }
2026-02-25 17:47:55 +03:00

235 lines
11 KiB
Python

# [DEF:backend.src.core.mapping_service:Module]
#
# @TIER: CRITICAL
# @SEMANTICS: mapping, ids, synchronization, environments, cross-filters
# @PURPOSE: Service for tracking and synchronizing Superset Resource IDs (UUID <-> Integer ID)
# @LAYER: Core
# @RELATION: DEPENDS_ON -> backend.src.models.mapping (ResourceMapping, ResourceType)
# @RELATION: DEPENDS_ON -> backend.src.core.logger
# @TEST_DATA: mock_superset_resources -> {'chart': [{'id': 42, 'uuid': '1234', 'slice_name': 'test'}], 'dataset': [{'id': 99, 'uuid': '5678', 'table_name': 'data'}]}
#
# @INVARIANT: sync_environment must handle remote API failures gracefully.
# [SECTION: IMPORTS]
from typing import Dict, List, Optional
from datetime import datetime, timezone
from sqlalchemy.orm import Session
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from src.models.mapping import ResourceMapping, ResourceType
from src.core.logger import logger, belief_scope
# [/SECTION]
# [DEF:IdMappingService:Class]
# @TIER: CRITICAL
# @PURPOSE: Service handling the cataloging and retrieval of remote Superset Integer IDs.
class IdMappingService:
# [DEF:__init__:Function]
# @PURPOSE: Initializes the mapping service.
def __init__(self, db_session: Session):
self.db = db_session
self.scheduler = BackgroundScheduler()
self._sync_job = None
# [/DEF:__init__:Function]
# [DEF:start_scheduler:Function]
# @PURPOSE: Starts the background scheduler with a given cron string.
# @PARAM: cron_string (str) - Cron expression for the sync interval.
# @PARAM: environments (List[str]) - List of environment IDs to sync.
# @PARAM: superset_client_factory - Function to get a client for an environment.
def start_scheduler(self, cron_string: str, environments: List[str], superset_client_factory):
with belief_scope("IdMappingService.start_scheduler"):
if self._sync_job:
self.scheduler.remove_job(self._sync_job.id)
logger.info("[IdMappingService.start_scheduler][Reflect] Removed existing sync job.")
def sync_all():
for env_id in environments:
client = superset_client_factory(env_id)
if client:
self.sync_environment(env_id, client)
self._sync_job = self.scheduler.add_job(
sync_all,
CronTrigger.from_crontab(cron_string),
id='id_mapping_sync_job',
replace_existing=True
)
if not self.scheduler.running:
self.scheduler.start()
logger.info(f"[IdMappingService.start_scheduler][Coherence:OK] Started background scheduler with cron: {cron_string}")
else:
logger.info(f"[IdMappingService.start_scheduler][Coherence:OK] Updated background scheduler with cron: {cron_string}")
# [/DEF:start_scheduler:Function]
# [DEF:sync_environment:Function]
# @PURPOSE: Fully synchronizes mapping for a specific environment.
# @PARAM: environment_id (str) - Target environment ID.
# @PARAM: superset_client - Instance capable of hitting the Superset API.
# @PRE: environment_id exists in the database.
# @POST: ResourceMapping records for the environment are created or updated.
def sync_environment(self, environment_id: str, superset_client, incremental: bool = False) -> None:
"""
Polls the Superset APIs for the target environment and updates the local mapping table.
If incremental=True, only fetches items changed since the max last_synced_at date.
"""
with belief_scope("IdMappingService.sync_environment"):
logger.info(f"[IdMappingService.sync_environment][Action] Starting sync for environment {environment_id} (incremental={incremental})")
# Implementation Note: In a real scenario, superset_client needs to be an instance
# capable of auth & iteration over /api/v1/chart/, /api/v1/dataset/, /api/v1/dashboard/
# Here we structure the logic according to the spec.
types_to_poll = [
(ResourceType.CHART, "chart", "slice_name"),
(ResourceType.DATASET, "dataset", "table_name"),
(ResourceType.DASHBOARD, "dashboard", "slug") # Note: dashboard slug or dashboard_title
]
total_synced = 0
total_deleted = 0
try:
for res_enum, endpoint, name_field in types_to_poll:
logger.debug(f"[IdMappingService.sync_environment][Explore] Polling {endpoint} endpoint")
# Simulated API Fetch (Would be: superset_client.get(f"/api/v1/{endpoint}/")... )
# This relies on the superset API structure, e.g. { "result": [{"id": 1, "uuid": "...", name_field: "..."}] }
# We assume superset_client provides a generic method to fetch all pages.
try:
since_dttm = None
if incremental:
from sqlalchemy.sql import func
max_date = self.db.query(func.max(ResourceMapping.last_synced_at)).filter(
ResourceMapping.environment_id == environment_id,
ResourceMapping.resource_type == res_enum
).scalar()
if max_date:
# We subtract a bit for safety overlap
from datetime import timedelta
since_dttm = max_date - timedelta(minutes=5)
logger.debug(f"[IdMappingService.sync_environment] Incremental sync since {since_dttm}")
resources = superset_client.get_all_resources(endpoint, since_dttm=since_dttm)
# Track which UUIDs we see in this sync cycle
synced_uuids = set()
for res in resources:
res_uuid = res.get("uuid")
raw_id = res.get("id")
res_name = res.get(name_field)
if not res_uuid or raw_id is None:
continue
synced_uuids.add(res_uuid)
res_id = str(raw_id) # Store as string
# Upsert Logic
mapping = self.db.query(ResourceMapping).filter_by(
environment_id=environment_id,
resource_type=res_enum,
uuid=res_uuid
).first()
if mapping:
mapping.remote_integer_id = res_id
mapping.resource_name = res_name
mapping.last_synced_at = datetime.now(timezone.utc)
else:
new_mapping = ResourceMapping(
environment_id=environment_id,
resource_type=res_enum,
uuid=res_uuid,
remote_integer_id=res_id,
resource_name=res_name,
last_synced_at=datetime.now(timezone.utc)
)
self.db.add(new_mapping)
total_synced += 1
# Delete stale mappings: rows for this env+type whose UUID
# was NOT returned by the API (resource was deleted remotely)
# We only do this on full syncs, because incremental syncs don't return all UUIDs
if not incremental:
stale_query = self.db.query(ResourceMapping).filter(
ResourceMapping.environment_id == environment_id,
ResourceMapping.resource_type == res_enum,
)
if synced_uuids:
stale_query = stale_query.filter(
ResourceMapping.uuid.notin_(synced_uuids)
)
deleted = stale_query.delete(synchronize_session="fetch")
if deleted:
total_deleted += deleted
logger.info(f"[IdMappingService.sync_environment][Action] Removed {deleted} stale {endpoint} mapping(s) for {environment_id}")
except Exception as loop_e:
logger.error(f"[IdMappingService.sync_environment][Reason] Error polling {endpoint}: {loop_e}")
# Continue to next resource type instead of blowing up the whole sync
self.db.commit()
logger.info(f"[IdMappingService.sync_environment][Coherence:OK] Successfully synced {total_synced} items and deleted {total_deleted} stale items.")
except Exception as e:
self.db.rollback()
logger.error(f"[IdMappingService.sync_environment][Coherence:Failed] Critical sync failure: {e}")
raise
# [/DEF:sync_environment:Function]
# [DEF:get_remote_id:Function]
# @PURPOSE: Retrieves the remote integer ID for a given universal UUID.
# @PARAM: environment_id (str)
# @PARAM: resource_type (ResourceType)
# @PARAM: uuid (str)
# @RETURN: Optional[int]
def get_remote_id(self, environment_id: str, resource_type: ResourceType, uuid: str) -> Optional[int]:
mapping = self.db.query(ResourceMapping).filter_by(
environment_id=environment_id,
resource_type=resource_type,
uuid=uuid
).first()
if mapping:
try:
return int(mapping.remote_integer_id)
except ValueError:
return None
return None
# [/DEF:get_remote_id:Function]
# [DEF:get_remote_ids_batch:Function]
# @PURPOSE: Retrieves remote integer IDs for a list of universal UUIDs efficiently.
# @PARAM: environment_id (str)
# @PARAM: resource_type (ResourceType)
# @PARAM: uuids (List[str])
# @RETURN: Dict[str, int] - Mapping of UUID -> Integer ID
def get_remote_ids_batch(self, environment_id: str, resource_type: ResourceType, uuids: List[str]) -> Dict[str, int]:
if not uuids:
return {}
mappings = self.db.query(ResourceMapping).filter(
ResourceMapping.environment_id == environment_id,
ResourceMapping.resource_type == resource_type,
ResourceMapping.uuid.in_(uuids)
).all()
result = {}
for m in mappings:
try:
result[m.uuid] = int(m.remote_integer_id)
except ValueError:
pass
return result
# [/DEF:get_remote_ids_batch:Function]
# [/DEF:IdMappingService:Class]
# [/DEF:backend.src.core.mapping_service:Module]