Files
ss-tools/backend/src/core/utils/matching.py

54 lines
2.0 KiB
Python

# [DEF:backend.src.core.utils.matching:Module]
#
# @SEMANTICS: fuzzy, matching, rapidfuzz, database, mapping
# @PURPOSE: Provides utility functions for fuzzy matching database names.
# @LAYER: Core
# @RELATION: DEPENDS_ON -> rapidfuzz
#
# @INVARIANT: Confidence scores are returned as floats between 0.0 and 1.0.
# [SECTION: IMPORTS]
from rapidfuzz import fuzz, process
from typing import List, Dict
# [/SECTION]
# [DEF:suggest_mappings:Function]
# @PURPOSE: Suggests mappings between source and target databases using fuzzy matching.
# @PRE: source_databases and target_databases are lists of dictionaries with 'uuid' and 'database_name'.
# @POST: Returns a list of suggested mappings with confidence scores.
# @PARAM: source_databases (List[Dict]) - Databases from the source environment.
# @PARAM: target_databases (List[Dict]) - Databases from the target environment.
# @PARAM: threshold (int) - Minimum confidence score (0-100).
# @RETURN: List[Dict] - Suggested mappings.
def suggest_mappings(source_databases: List[Dict], target_databases: List[Dict], threshold: int = 60) -> List[Dict]:
    """
    Suggest mappings between source and target databases using fuzzy matching.

    Every source database is compared against all target database names and
    paired with the single best-scoring target. The pair is kept only when
    its score reaches ``threshold``. No de-duplication is performed, so
    several sources may be mapped to the same target.

    Args:
        source_databases: Dicts carrying at least 'uuid' and 'database_name'.
        target_databases: Dicts carrying at least 'uuid' and 'database_name'.
        threshold: Minimum match score, on the scorer's 0-100 scale, for a
            suggestion to be emitted.

    Returns:
        A list of dicts with the keys 'source_db_uuid', 'target_db_uuid' and
        'confidence'. 'confidence' is the match score divided by 100.0, i.e.
        rescaled from 0-100 to 0.0-1.0. The list is empty when either input
        is empty or no pair reaches the threshold.

    Raises:
        KeyError: If a database entry lacks 'database_name', or an entry
            that ends up in a suggestion lacks 'uuid'.
    """
    suggestions: List[Dict] = []
    # Nothing to pair up when either side is empty.
    if not source_databases or not target_databases:
        return suggestions

    # Kept parallel to target_databases: the index reported by extractOne
    # is a position in this list and is used to look the target back up.
    target_names = [db['database_name'] for db in target_databases]

    for s_db in source_databases:
        # Use token_sort_ratio as decided in research.md.
        # score_cutoff makes extractOne return None when no candidate
        # reaches the threshold, so no separate score comparison is needed.
        match = process.extractOne(
            s_db['database_name'],
            target_names,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold,
        )
        if match is None:
            continue

        # extractOne yields (matched_name, score, index); the name itself is
        # not needed because the target is resolved through its index.
        _, score, index = match
        suggestions.append({
            "source_db_uuid": s_db['uuid'],
            "target_db_uuid": target_databases[index]['uuid'],
            "confidence": score / 100.0,
        })

    return suggestions
# [/DEF:suggest_mappings:Function]
# [/DEF:backend.src.core.utils.matching:Module]