MigrationEngine contracts. It avoids tautologies by cleanly substituting IdMappingService without mocking the engine itself. Cross-filter parsing asserts against hard-coded, predefined validation dictionaries (no logic mirroring). It successfully addresses @PRE negative cases (e.g. invalid zip paths, missing YAMLs) and rigorously validates @POST file transformations (e.g. in-place UUID substitutions and archive reconstruction)." }
235 lines
11 KiB
Python
235 lines
11 KiB
Python
# [DEF:backend.src.core.mapping_service:Module]
|
|
#
|
|
# @TIER: CRITICAL
|
|
# @SEMANTICS: mapping, ids, synchronization, environments, cross-filters
|
|
# @PURPOSE: Service for tracking and synchronizing Superset Resource IDs (UUID <-> Integer ID)
|
|
# @LAYER: Core
|
|
# @RELATION: DEPENDS_ON -> backend.src.models.mapping (ResourceMapping, ResourceType)
|
|
# @RELATION: DEPENDS_ON -> backend.src.core.logger
|
|
# @TEST_DATA: mock_superset_resources -> {'chart': [{'id': 42, 'uuid': '1234', 'slice_name': 'test'}], 'dataset': [{'id': 99, 'uuid': '5678', 'table_name': 'data'}]}
|
|
#
|
|
# @INVARIANT: sync_environment must handle remote API failures gracefully.
|
|
|
|
# [SECTION: IMPORTS]
from datetime import datetime, timedelta, timezone
from typing import Dict, List, Optional

from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from sqlalchemy.orm import Session
from sqlalchemy.sql import func

from src.core.logger import logger, belief_scope
from src.models.mapping import ResourceMapping, ResourceType
# [/SECTION]
|
|
|
|
# [DEF:IdMappingService:Class]
# @TIER: CRITICAL
# @PURPOSE: Service handling the cataloging and retrieval of remote Superset Integer IDs.
class IdMappingService:

    # [DEF:__init__:Function]
    # @PURPOSE: Initializes the mapping service.
    # @PARAM: db_session (Session) - SQLAlchemy session used for all mapping reads/writes.
    def __init__(self, db_session: Session):
        self.db = db_session
        # Scheduler is created eagerly but only started by start_scheduler().
        self.scheduler = BackgroundScheduler()
        # Handle of the currently registered sync job; None until scheduled.
        self._sync_job = None
    # [/DEF:__init__:Function]

    # [DEF:start_scheduler:Function]
    # @PURPOSE: Starts the background scheduler with a given cron string.
    # @PARAM: cron_string (str) - Cron expression for the sync interval.
    # @PARAM: environments (List[str]) - List of environment IDs to sync.
    # @PARAM: superset_client_factory - Function to get a client for an environment.
    # @POST: Exactly one job with id 'id_mapping_sync_job' is registered (any previous one removed).
    def start_scheduler(self, cron_string: str, environments: List[str], superset_client_factory) -> None:
        with belief_scope("IdMappingService.start_scheduler"):
            if self._sync_job:
                # Drop the previous job so a reconfigure does not double-schedule.
                self.scheduler.remove_job(self._sync_job.id)
                logger.info("[IdMappingService.start_scheduler][Reflect] Removed existing sync job.")

            def sync_all():
                # Sync every configured environment; a falsy client (factory
                # could not build one) simply skips that environment.
                for env_id in environments:
                    client = superset_client_factory(env_id)
                    if client:
                        self.sync_environment(env_id, client)

            self._sync_job = self.scheduler.add_job(
                sync_all,
                CronTrigger.from_crontab(cron_string),
                id='id_mapping_sync_job',
                replace_existing=True
            )

            if not self.scheduler.running:
                self.scheduler.start()
                logger.info(f"[IdMappingService.start_scheduler][Coherence:OK] Started background scheduler with cron: {cron_string}")
            else:
                logger.info(f"[IdMappingService.start_scheduler][Coherence:OK] Updated background scheduler with cron: {cron_string}")
    # [/DEF:start_scheduler:Function]

    # [DEF:sync_environment:Function]
    # @PURPOSE: Fully synchronizes mapping for a specific environment.
    # @PARAM: environment_id (str) - Target environment ID.
    # @PARAM: superset_client - Instance capable of hitting the Superset API.
    # @PARAM: incremental (bool) - If True, only fetch items changed since the last sync.
    # @PRE: environment_id exists in the database.
    # @POST: ResourceMapping records for the environment are created or updated.
    def sync_environment(self, environment_id: str, superset_client, incremental: bool = False) -> None:
        """
        Polls the Superset APIs for the target environment and updates the local mapping table.

        If incremental=True, only fetches items changed since the max last_synced_at date.
        A failure on a single resource type is logged and skipped; a failure outside the
        per-type loop (e.g. on commit) rolls the session back and re-raises.
        """
        with belief_scope("IdMappingService.sync_environment"):
            logger.info(f"[IdMappingService.sync_environment][Action] Starting sync for environment {environment_id} (incremental={incremental})")

            # Implementation Note: In a real scenario, superset_client needs to be an instance
            # capable of auth & iteration over /api/v1/chart/, /api/v1/dataset/, /api/v1/dashboard/
            # (enum, API endpoint, attribute used as the human-readable name)
            types_to_poll = [
                (ResourceType.CHART, "chart", "slice_name"),
                (ResourceType.DATASET, "dataset", "table_name"),
                (ResourceType.DASHBOARD, "dashboard", "slug")  # Note: dashboard slug or dashboard_title
            ]

            total_synced = 0
            total_deleted = 0
            try:
                for res_enum, endpoint, name_field in types_to_poll:
                    try:
                        synced, deleted = self._sync_resource_type(
                            environment_id, superset_client, res_enum, endpoint, name_field, incremental
                        )
                        total_synced += synced
                        total_deleted += deleted
                    except Exception as loop_e:
                        # Continue to next resource type instead of blowing up the whole sync.
                        logger.error(f"[IdMappingService.sync_environment][Reason] Error polling {endpoint}: {loop_e}")

                self.db.commit()
                logger.info(f"[IdMappingService.sync_environment][Coherence:OK] Successfully synced {total_synced} items and deleted {total_deleted} stale items.")

            except Exception as e:
                self.db.rollback()
                logger.error(f"[IdMappingService.sync_environment][Coherence:Failed] Critical sync failure: {e}")
                raise
    # [/DEF:sync_environment:Function]

    # [DEF:_sync_resource_type:Function]
    # @PURPOSE: Polls one resource endpoint and upserts/deletes its mappings (no commit).
    # @PARAM: environment_id (str) - Target environment ID.
    # @PARAM: superset_client - Client exposing get_all_resources(endpoint, since_dttm=...).
    # @PARAM: res_enum (ResourceType) - Resource type being synced.
    # @PARAM: endpoint (str) - Superset API endpoint name (e.g. "chart").
    # @PARAM: name_field (str) - Response key holding the resource's display name.
    # @PARAM: incremental (bool) - Restrict the fetch to recently changed items.
    # @RETURN: (synced_count, deleted_count) tuple of ints.
    def _sync_resource_type(self, environment_id: str, superset_client, res_enum,
                            endpoint: str, name_field: str, incremental: bool):
        logger.debug(f"[IdMappingService.sync_environment][Explore] Polling {endpoint} endpoint")

        since_dttm = None
        if incremental:
            max_date = self.db.query(func.max(ResourceMapping.last_synced_at)).filter(
                ResourceMapping.environment_id == environment_id,
                ResourceMapping.resource_type == res_enum
            ).scalar()
            if max_date:
                # Subtract a small overlap window so items updated right at the
                # previous sync boundary are not missed.
                since_dttm = max_date - timedelta(minutes=5)
                logger.debug(f"[IdMappingService.sync_environment] Incremental sync since {since_dttm}")

        # We assume superset_client provides a generic method to fetch all pages,
        # yielding dicts shaped like {"id": 1, "uuid": "...", name_field: "..."}.
        resources = superset_client.get_all_resources(endpoint, since_dttm=since_dttm)

        synced = 0
        synced_uuids = set()  # UUIDs seen in this cycle; drives stale deletion below
        for res in resources:
            res_uuid = res.get("uuid")
            raw_id = res.get("id")
            if not res_uuid or raw_id is None:
                # Cannot build a mapping without both identifiers.
                continue
            synced_uuids.add(res_uuid)
            # Remote integer IDs are stored as strings.
            self._upsert_mapping(environment_id, res_enum, res_uuid, str(raw_id), res.get(name_field))
            synced += 1

        # Delete stale mappings only on full syncs: incremental responses do not
        # enumerate every UUID, so absence there does not imply remote deletion.
        deleted = 0
        if not incremental:
            deleted = self._delete_stale_mappings(environment_id, res_enum, endpoint, synced_uuids)
        return synced, deleted
    # [/DEF:_sync_resource_type:Function]

    # [DEF:_upsert_mapping:Function]
    # @PURPOSE: Creates or updates a single ResourceMapping row (no commit).
    # @PARAM: res_id (str) - Remote integer ID, already stringified.
    # @PARAM: res_name - Display name from the API response (may be None).
    def _upsert_mapping(self, environment_id: str, res_enum, res_uuid: str, res_id: str, res_name) -> None:
        mapping = self.db.query(ResourceMapping).filter_by(
            environment_id=environment_id,
            resource_type=res_enum,
            uuid=res_uuid
        ).first()

        if mapping:
            mapping.remote_integer_id = res_id
            mapping.resource_name = res_name
            mapping.last_synced_at = datetime.now(timezone.utc)
        else:
            self.db.add(ResourceMapping(
                environment_id=environment_id,
                resource_type=res_enum,
                uuid=res_uuid,
                remote_integer_id=res_id,
                resource_name=res_name,
                last_synced_at=datetime.now(timezone.utc)
            ))
    # [/DEF:_upsert_mapping:Function]

    # [DEF:_delete_stale_mappings:Function]
    # @PURPOSE: Removes mappings for env+type whose UUID was NOT returned by the API
    #           (i.e. the resource was deleted remotely). No commit.
    # @RETURN: Number of rows deleted (int).
    def _delete_stale_mappings(self, environment_id: str, res_enum, endpoint: str, synced_uuids) -> int:
        stale_query = self.db.query(ResourceMapping).filter(
            ResourceMapping.environment_id == environment_id,
            ResourceMapping.resource_type == res_enum,
        )
        if synced_uuids:
            stale_query = stale_query.filter(
                ResourceMapping.uuid.notin_(synced_uuids)
            )
        deleted = stale_query.delete(synchronize_session="fetch")
        if deleted:
            logger.info(f"[IdMappingService.sync_environment][Action] Removed {deleted} stale {endpoint} mapping(s) for {environment_id}")
        return deleted
    # [/DEF:_delete_stale_mappings:Function]

    # [DEF:get_remote_id:Function]
    # @PURPOSE: Retrieves the remote integer ID for a given universal UUID.
    # @PARAM: environment_id (str)
    # @PARAM: resource_type (ResourceType)
    # @PARAM: uuid (str)
    # @RETURN: Optional[int] - None when no mapping exists or the stored ID is not numeric.
    def get_remote_id(self, environment_id: str, resource_type: ResourceType, uuid: str) -> Optional[int]:
        mapping = self.db.query(ResourceMapping).filter_by(
            environment_id=environment_id,
            resource_type=resource_type,
            uuid=uuid
        ).first()

        if mapping:
            try:
                return int(mapping.remote_integer_id)
            except (TypeError, ValueError):
                # Stored value is missing (None) or non-numeric; treat as unmapped.
                return None
        return None
    # [/DEF:get_remote_id:Function]

    # [DEF:get_remote_ids_batch:Function]
    # @PURPOSE: Retrieves remote integer IDs for a list of universal UUIDs efficiently.
    # @PARAM: environment_id (str)
    # @PARAM: resource_type (ResourceType)
    # @PARAM: uuids (List[str])
    # @RETURN: Dict[str, int] - Mapping of UUID -> Integer ID (unparseable IDs omitted).
    def get_remote_ids_batch(self, environment_id: str, resource_type: ResourceType, uuids: List[str]) -> Dict[str, int]:
        if not uuids:
            return {}

        # Single IN-query instead of one lookup per UUID.
        mappings = self.db.query(ResourceMapping).filter(
            ResourceMapping.environment_id == environment_id,
            ResourceMapping.resource_type == resource_type,
            ResourceMapping.uuid.in_(uuids)
        ).all()

        result = {}
        for m in mappings:
            try:
                result[m.uuid] = int(m.remote_integer_id)
            except (TypeError, ValueError):
                # Skip rows whose stored ID cannot be parsed as an integer.
                pass

        return result
    # [/DEF:get_remote_ids_batch:Function]

# [/DEF:IdMappingService:Class]
|
|
# [/DEF:backend.src.core.mapping_service:Module]
|