Commit 183e9eb0 authored by harshavardhan.c's avatar harshavardhan.c

feat: Converted app to fastapi app.

parent 7845d0b6
__version__ = "v1.0.0"
......@@ -20,7 +20,7 @@ broker = KafkaBroker(
async def consume_stream_for_processing_dependencies(message: dict):
try:
await ModelCreatorAgent.model_creator_agent(
message=ModelCreatorSchema(meta=message)
message=ModelCreatorSchema(**message)
)
return True
except Exception as e:
......
# app.py
import asyncio
import logging as logger
import sys
import gc
from dotenv import load_dotenv
gc.collect()
load_dotenv()
from faststream import FastStream
from ut_dev_utils import configure_logger
import argparse
from agent_subscribers import broker
ap = argparse.ArgumentParser()
configure_logger()
if __name__ == "__main__":
from dotenv import load_dotenv
# Create FastStream app
app = FastStream(broker)
load_dotenv()
from ut_dev_utils import configure_logger
async def run_app():
    """Run the FastStream application until it stops or is interrupted.

    Logs lifecycle events; re-raises unexpected errors so the process
    exits non-zero, but swallows KeyboardInterrupt as a normal shutdown.
    """
    try:
        logger.info("Starting FastStream application...")
        # Blocks until the broker stops or an error occurs.
        await app.run()
    except KeyboardInterrupt:
        logger.info("Application interrupted by user")
    except Exception as e:
        logger.error(f"Application error: {e}")
        raise  # propagate so the caller can exit with a failure status
    finally:
        logger.info("Application shutdown complete")
configure_logger()
import asyncio
import logging as logger
import sys
# Main execution
if __name__ == "__main__":
    try:
        # For better performance on Linux/Mac, use uvloop if available
        if sys.platform != "win32":
            try:
                import uvloop

                asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
                logger.info("Using uvloop for better performance")
            except ImportError:
                # uvloop is optional; fall back to the stdlib event loop.
                logger.info("uvloop not available, using default event loop")
        # Run the application
        asyncio.run(run_app())
    except KeyboardInterrupt:
        # Ctrl-C: exit quietly with status 0.
        print("\nApplication stopped by user")
    except Exception as e:
        logger.error(f"Failed to start application: {e}")
        sys.exit(1)
from scripts.config import Services

# CLI flags override the configured host/port defaults.
ap.add_argument(
    "--port",
    "-p",
    required=False,
    default=Services.PORT,
    help="Port to start the application.",
)
ap.add_argument(
    "--bind",
    "-b",
    required=False,
    default=Services.HOST,
    help="IP to start the application.",
)
arguments = vars(ap.parse_args())
logger.info(f"App Starting at {arguments['bind']}:{arguments['port']}")
if sys.platform == "win32":
    # Windows: granian is not used; serve the ASGI app with uvicorn and the
    # selector event loop policy (required by some libraries on Windows).
    import uvicorn

    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    uvicorn.run(
        "main:app",
        host=arguments["bind"],
        port=int(arguments["port"]),
        root_path="",
    )
else:
    # Linux/Mac: serve with granian's ASGI interface.
    from granian import Granian
    from granian.constants import Interfaces

    Granian(
        "main:app",
        address=arguments["bind"],
        port=int(arguments["port"]),
        interface=Interfaces.ASGI,
        log_access=True,
        log_enabled=True,
        respawn_failed_workers=True,
        threads=10,
        threading_mode="runtime",
    ).serve()
import sys

from ut_dev_utils import FastAPIConfig, generate_fastapi_app
from ut_dev_utils.errors.exception_handlers import ExceptionHandlers

from __version__ import __version__
from scripts.config import PROJECT_NAME
from scripts.core.services import router

description = """
Databricks Platform Automation microservice for FTDMPC.
"""
tags_metadata = []

# FastAPI application configuration. On Windows (local dev) the app is served
# at the root; elsewhere it sits behind a proxy under /dbx_mgmt.
app_config = FastAPIConfig(
    title="Databricks Platform Automation APP",
    description=description,
    version=__version__,
    root_path="" if sys.platform == "win32" else "/dbx_mgmt",
    tags_metadata=tags_metadata,
    exception_handlers={
        # Catch-all handler so unexpected errors return a uniform payload.
        Exception: ExceptionHandlers.generic_exception_handler,
    },
)
# Module-level ASGI app referenced by the servers as "main:app".
app = generate_fastapi_app(
    app_config,
    routers=[router],
    project_name=PROJECT_NAME,
    enable_default_openapi=True,
)
......@@ -61,27 +61,18 @@ class _KafkaConfig(BaseSettings):
# NOTE(review): this block is a diff render that interleaves removed and added
# lines — DATABRICKS_URI/DATABRICKS_HTTP_PATH and the prepare_databricks_uri
# validator appear to be the removed side, the DATABRICKS_CLUSTER_* fields the
# added side. Confirm against the repository before relying on this copy.
class _DatabricksConfig(BaseSettings):
    # Connection settings (host/token have no defaults; must come from env).
    DATABRICKS_HOST: str
    DATABRICKS_PORT: int = Field(default=443)
    DATABRICKS_URI: str
    DATABRICKS_HTTP_PATH: str
    DATABRICKS_ACCESS_TOKEN: str
    DATABRICKS_DEFAULT_PORT: int = Field(default=443)
    # Unity Catalog layout.
    DATABRICKS_CATALOG_NAME: str = Field(default="unified_model")
    DATABRICKS_PUBLIC_SCHEMA_NAME: str = Field(default="public")
    DATABRICKS_ANALYTICAL_SCHEMA_NAME: str = Field(default="analytical")
    DATABRICKS_STORAGE_FORMAT: str = Field(default="PARQUET")
    DATABRICKS_STORAGE_PATH: str = Field(
        default="abfss://unity-catalog-storage@dbstoragenzxfhpgsipt5a.dfs.core.windows.net/416418955412087"
    )

    @model_validator(mode="before")
    def prepare_databricks_uri(cls, values):
        # Builds a sqlalchemy-databricks URI from the raw settings before
        # field validation runs.
        values["DATABRICKS_URI"] = (
            f"databricks://token:{values['DATABRICKS_ACCESS_TOKEN']}@{values['DATABRICKS_HOST']}:{values['DATABRICKS_PORT']}"
            f"?http_path={values['DATABRICKS_HTTP_PATH']}"
        )
        return values

    # Streaming cluster provisioning defaults.
    DATABRICKS_CLUSTER_NAME: str = Field(default="UT-Steaming-Cluster")
    DATABRICKS_CLUSTER_DISK_SIZE: int = Field(default=150)
    DATABRICKS_CLUSTER_MIN_WORKERS: int = Field(default=1)
    DATABRICKS_CLUSTER_SPARK_VERSION: str = Field(default="15.4.x-scala2.12")
    DATABRICKS_CLUSTER_RUNTIME_VERSION: str = Field(default="9.1")
    DATABRICKS_CLUSTER_NODE_TYPE_ID: str = Field(default="Standard_DS3_v2")
    DATABRICKS_CLUSTER_DRIVER_NODE_TYPE_ID: str = Field(default="Standard_DS3_v2")
Services = _Services()
......@@ -98,4 +89,5 @@ __all__ = [
"PathToStorage",
"KafkaConfig",
"DatabricksConfig",
"PROJECT_NAME",
]
class DatabricksConstants:
    """Well-known names of Databricks jobs, notebooks, and volumes used by this service."""

    # Job names
    METADATA_INGESTION_JOB_NAME = "metadata_ingestion_job"
    METADATA_DELETION_JOB_NAME = "metadata_deletion_job"
    TIMESERIES_INGESTION_JOB_NAME = "timeseries_ingestion_job"
    # Notebook names backing the jobs above
    METADATA_INGESTION_NOTEBOOK_NAME = "metadata_ingestion_notebook"
    METADATA_DELETION_NOTEBOOK_NAME = "metadata_deletion_notebook"
    TIMESERIES_INGESTION_NOTEBOOK_NAME = "timeseries_ingestion_notebook"
    # Unity Catalog volume name
    VOLUME_NAME = "unity_catalog_storage"
class NotebookConstants:
......
from fastapi import APIRouter

# Root API router; aggregates all versioned sub-routers.
router = APIRouter()

# Imported after router creation — NOTE(review): likely to avoid a circular
# import; confirm before moving to the top of the file.
from .v1 import v1_router

router.include_router(v1_router)
from fastapi import APIRouter

# All v1 endpoints are mounted under this prefix.
v1_router = APIRouter(prefix="/api/v1")

__all__ = ["v1_router"]

# Imported after v1_router exists — NOTE(review): presumably to avoid a
# circular import; the sub-module may import v1_router back.
from .model_creator_services import model_creator_router

v1_router.include_router(model_creator_router)
import logging
from typing import Annotated
from fastapi import BackgroundTasks
from fastapi.params import Depends, Query
from faststream.confluent.fastapi import KafkaRouter
from ut_dev_utils.responses import DefaultResponseSchema
from ut_security_util import MetaInfoSchema
from ut_sql_utils.asyncio.declarative_utils import DeclarativeUtils
from scripts.config import KafkaConfig
from scripts.core.handlers.model_creator_handler import ModelCreatorHandler
from scripts.db.psql import get_declarative_utils
from scripts.decorators.databricks_validator import get_databricks_config
from scripts.schemas import ModelCreatorSchema
model_creator_router = KafkaRouter(KafkaConfig.KAFKA_URI)
@model_creator_router.get("/model_creator")
async def add_to_stream(
    meta: MetaInfoSchema,
    bg_task: BackgroundTasks,
    payload: Annotated[ModelCreatorSchema, Depends(get_databricks_config)],
    declarative_utils: DeclarativeUtils = Depends(get_declarative_utils),
    analytical: bool = Query(default=False),
):
    """Queue Unity Catalog model creation as a FastAPI background task.

    The Databricks configuration is resolved by the get_databricks_config
    dependency; the actual work runs after the response is returned.
    """
    handler = ModelCreatorHandler(
        declarative_utils=declarative_utils, meta=meta, message=payload
    )
    logging.info("Adding background task for model creation...")
    bg_task.add_task(
        handler.create_models_in_unity_catalog, analytical=analytical
    )
    return DefaultResponseSchema(message="Model creation task added to stream")
from typing import Dict, List
from sqlalchemy import (
BigInteger,
Column,
Date,
DateTime,
Integer,
MetaData,
String,
Table,
)
from scripts.utils.databricks_utils import DatabricksSQLUtility
from scripts.utils.model_convertor_utils import TypeMapper
# NOTE(review): this block is two files interleaved by a bad diff render — a
# DatabricksManager.__init__ body is spliced into DataBricksSQLLayer (see the
# stray "class DatabricksManager:" text inside the first docstring and the
# self.host / self.headers fragments below). A clean copy of this class exists
# later in the file; this copy is not syntactically valid as shown.
class DataBricksSQLLayer(DatabricksSQLUtility):
    def __init__(self, catalog_name: str, project_id: str, schema: str):
        super().__init__(catalog_name, project_id)
        self.schema = schema

    def create_external_table_from_structure(
        self,
        table: Table,
        external_location: str,
        file_format: str = "PARQUET",
        table_properties: Dict[str, str] = None,
        partition_columns: list = None,
    ) -> str:
        """
        Create an external table from a model class.
        Args:
            table: The model class to create the external table from.
            external_location: The external location path.
            file_format: The file format of the data files.
            table_properties: Additional table properties.
            partition_columns: List of columns to partition the table by.
        Returns:
            External Location - Returns the external location
        class DatabricksManager:
            def __init__(self, databricks_host: str, access_token: str):
        """
        schema_table = f"{table.schema}.{table.name}" if table.schema else table.name
        columns_sql = TypeMapper().extract_columns_without_constraints(table)
        external_location = (
            f"{external_location}/{self.catalog_name}/{file_format}/{schema_table}"
        )
        sql_parts = [
            f"CREATE TABLE IF NOT EXISTS {schema_table}",
            f"({columns_sql})",
            f"USING {file_format}",
            f"LOCATION '{external_location}'",
        ]
        if partition_columns:
            partition_clause = ", ".join(partition_columns)
            sql_parts.append(f"PARTITIONED BY ({partition_clause})")
        if table_properties:
            props = [f"'{k}' = '{v}'" for k, v in table_properties.items()]
            props_sql = ",\n ".join(props)
            sql_parts.append(f"TBLPROPERTIES (\n {props_sql}\n)")
        create_sql = "\n".join(sql_parts)
        self.execute_sql_statement(create_sql)
        return external_location

    # NOTE(review): stray DatabricksManager.__init__ docstring line follows.
    Initialize Databricks Manager

    def create_timeseries_table(self, columns: List[str], external_location: str):
        """
        Create a timeseries table model and all columns will be of type String
        Args:
            columns: List of columns in the table
            external_location: The external location path
        Example:
            columns = [l1,l2,enterprise]
        Returns:
            Timeseries Table model
        databricks_host: Your Databricks workspace URL
        access_token: Personal access token or service principal token
        """
        table_columns = [
            Column("timestamp", BigInteger, nullable=False),
            Column("dt_timestamp", DateTime, nullable=False),
            Column("dt_date", Date, nullable=False),
            Column("dt_hour", Integer, nullable=False),
            Column("value", String, nullable=False),
            Column("value_type", String, nullable=False, default="float"),
            Column("c3", String, nullable=False),
        ]
        default_columns = ["c1", "c5", "Q", "T", "D", "P", "A", "B", *columns]
        table_columns.extend(
            [Column(col_name, String, nullable=True) for col_name in default_columns]
        # NOTE(review): DatabricksManager.__init__ fragment spliced in here;
        # the extend(...) call above is left unclosed by the bad merge.
        self.host = (
            databricks_host
            if "https://" in databricks_host
            else f"https://{databricks_host}"
        )
        partition_columns = ["dt_date", "dt_hour", "c3"]
        table_properties = {
            "parquet.compression": "snappy",  # Fast decompression for frequent queries
            "parquet.page.size": "524288",  # 512KB - better time-range filtering
            "parquet.block.size": "268435456",  # 256MB - efficient sequential reads
            "serialization.format": "1",  # Support for arrays/complex types
        self.headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json",
        }
        table_obj = Table(
            "timeseries_data", MetaData(), *table_columns, schema=self.schema
        )
        self.create_external_table_from_structure(
            table=table_obj,
            external_location=external_location,
            partition_columns=partition_columns,
            table_properties=table_properties,
        )
        return external_location
import logging
import time
from typing import Union
from scripts.config import DatabricksConfig
from scripts.db.databricks import DatabricksManager
from scripts.utils.httpx_util import HTTPXRequestUtil
class DatabricksClusterManager(DatabricksManager):
    """Manage Databricks clusters (create/start/inspect) via the Clusters 2.1 REST API."""

    def __init__(self, databricks_host: str, access_token: str):
        """
        Initialize Databricks cluster manager
        databricks_host: Your Databricks workspace URL
        access_token: Personal access token or service principal token
        """
        super().__init__(databricks_host, access_token)
        self.base_url = f"{self.host}/api/2.1/clusters"

    def create_cluster(self, cluster_config: dict):
        """
        Create a new cluster in Databricks
        Args:
            cluster_config: Dictionary containing cluster configuration
        Returns:
            str: Cluster ID if successful, None if failed
        """
        url = f"{self.base_url}/create"
        response = HTTPXRequestUtil(url).post(headers=self.headers, json=cluster_config)
        if response.status_code != 200:
            logging.error(f"Failed to create cluster: {response.text}")
            return None
        cluster_id = response.json().get("cluster_id")
        if not cluster_id:
            logging.error("No cluster_id returned from create request")
            return None
        logging.info(f"Cluster created with ID: {cluster_id}")
        # Wait for the cluster to be ready; the id is returned either way so
        # callers can decide how to handle a slow start.
        if self.wait_for_cluster_ready(cluster_id):
            logging.info(f"Cluster {cluster_id} is ready for use!")
        else:
            logging.error(f"Cluster {cluster_id} failed to start within timeout")
        return cluster_id

    def fetch_cluster_stats(self, cluster_id) -> dict:
        """
        Fetch the status of a cluster
        Args:
            cluster_id: The ID of the cluster
        Returns:
            dict: Cluster info from the API, or {} on error
        """
        url = f"{self.base_url}/get"
        params = {"cluster_id": cluster_id}
        response = HTTPXRequestUtil(url).get(headers=self.headers, params=params)
        if response.status_code == 200:
            return response.json()
        logging.error(f"Error checking cluster: {response.text}")
        return {}

    def start_cluster(self, cluster_id: str) -> bool:
        """
        Start a terminated cluster
        Args:
            cluster_id: ID of the cluster to start
        Returns:
            bool: True if the cluster started and became ready, False otherwise
        """
        url = f"{self.base_url}/start"
        payload = {"cluster_id": cluster_id}
        response = HTTPXRequestUtil(url).post(headers=self.headers, json=payload)
        if response.status_code != 200:
            # BUG FIX: the message previously said "Failed to create cluster".
            logging.error(f"Failed to start cluster {cluster_id}: {response.text}")
            return False
        # BUG FIX: the /clusters/start endpoint returns an empty body; the old
        # code looked for a cluster_id in the response and therefore always
        # logged an error and returned False even on success. We already have
        # the id — just wait for the cluster to become ready.
        if self.wait_for_cluster_ready(cluster_id):
            logging.info(f"Cluster {cluster_id} is ready for use!")
            return True
        logging.error(f"Cluster {cluster_id} failed to start within timeout")
        return False

    def get_existing_cluster_by_name(self, cluster_name: str) -> Union[None, dict]:
        """
        Check if a cluster with the given name already exists
        Args:
            cluster_name: Name of the cluster to search for
        Returns:
            dict: Cluster info if found, None if not found
        """
        url = f"{self.base_url}/list"
        response = HTTPXRequestUtil(url).get(headers=self.headers)
        if response.status_code == 200:
            clusters = response.json().get("clusters", [])
            for cluster in clusters:
                if cluster.get("cluster_name") == cluster_name:
                    return cluster
        else:
            logging.warning(f"Warning: Could not list clusters: {response.text}")
        return None

    def get_streaming_cluster_config(
        self, cluster_name: str = "UT-Steaming-Cluster"
    ) -> dict:
        """
        Get configuration for a continuous streaming cluster optimized for Event Hub processing
        Args:
            cluster_name: Name for the cluster (default: "UT-Steaming-Cluster")
        Returns:
            dict: Complete cluster configuration
        """
        return {
            "cluster_name": cluster_name,
            "spark_version": DatabricksConfig.DATABRICKS_CLUSTER_SPARK_VERSION,
            "node_type_id": DatabricksConfig.DATABRICKS_CLUSTER_NODE_TYPE_ID,  # 8 cores, 16GB RAM
            "driver_node_type_id": DatabricksConfig.DATABRICKS_CLUSTER_DRIVER_NODE_TYPE_ID,
            # CRITICAL: Never auto-terminate
            "auto_termination_minutes": 0,  # 0 = NEVER terminate
            # Auto-scaling for variable loads
            "autoscale": {
                "min_workers": DatabricksConfig.DATABRICKS_CLUSTER_MIN_WORKERS,  # Minimum cost
                "max_workers": 8,  # Scale up for high Event Hub volume
            },
            # "is_single_node": True,
            # Streaming optimizations
            "spark_conf": self.get_spark_config(),
            # Reliability settings
            "azure_attributes": {
                "availability": "ON_DEMAND_AZURE",  # Most reliable
                "first_on_demand": 1,
            },
            # Storage for checkpoints and logs
            "enable_elastic_disk": True,
            "disk_spec": {
                "disk_type": {"azure_disk_volume_type": "PREMIUM_LRS"},
                "disk_size": DatabricksConfig.DATABRICKS_CLUSTER_DISK_SIZE,
            },
            # Monitoring tags
            "custom_tags": {
                "purpose": "continuous_streaming",
                "workload": "eventhub_processing",
                "criticality": "high",
                "auto_terminate": "never",
            },
            # Unity Catalog
            "data_security_mode": "SINGLE_USER",
        }

    @staticmethod
    def get_spark_config() -> dict:
        """Spark settings tuned for the streaming cluster."""
        return {
            "spark.executor.memory": "6g",
            "spark.driver.memory": "5g",
            "spark.executor.cores": "3",  # Reduced from 4 to 3 (leave 1 core for OS)
            "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
            "spark.executor.instances": "4",
            "spark.sql.shuffle.partitions": "32",
            "spark.executor.extraJavaOptions": "-XX:+UseG1GC -XX:MaxGCPauseMillis=200",
            "spark.driver.extraJavaOptions": "-XX:+UseG1GC -XX:MaxGCPauseMillis=200",
        }

    def wait_for_cluster_ready(
        self, cluster_id: str, timeout_minutes: int = 10
    ) -> bool:
        """
        Wait for cluster with exponential backoff for more efficient polling
        Args:
            cluster_id: The ID of the cluster to poll
            timeout_minutes: How long to wait before giving up
        Returns:
            bool: True when the cluster reaches RUNNING, False on timeout or
            a terminal state (TERMINATED/TERMINATING/ERROR)
        """
        timeout_seconds = timeout_minutes * 60
        start_time = time.time()
        check_interval = 10  # Start with 10 seconds
        max_interval = 90  # Max 90 seconds between checks
        while time.time() - start_time < timeout_seconds:
            cluster_stats = self.fetch_cluster_stats(cluster_id)
            if cluster_stats:
                state = cluster_stats.get("state", "UNKNOWN")
                if state == "RUNNING":
                    return True
                elif state in ["TERMINATED", "TERMINATING", "ERROR"]:
                    return False
                elif state in ["PENDING", "RESTARTING", "RESIZING"]:
                    # These are transitional states - keep waiting
                    logging.info(
                        f"Cluster {cluster_id} is starting... Current state: {state}"
                    )
                else:
                    logging.warning(f"Unknown cluster state: {state}")
            # Exponential backoff
            logging.info(
                f"Cluster {cluster_id} not ready yet. Waiting {check_interval} seconds..."
            )
            time.sleep(check_interval)
            check_interval = min(check_interval * 1.5, max_interval)
        return False

    def get_http_path_details_by_cluster_id(self, cluster_id: str, workspace_url: str):
        """Build the SQL http_path for a cluster from the workspace org id."""
        return f"/sql/protocolv1/o/{self.extract_org_id(workspace_url)}/{cluster_id}"

    @staticmethod
    def extract_org_id(workspace_url: str):
        """Extract organization ID from Azure Databricks URL"""
        # From URL like: https://adb-416418955412087.7.azuredatabricks.net
        # Extract: 416418955412087
        import re

        match = re.search(r"adb-(\d+)", workspace_url.replace("https://", ""))
        return match.group(1) if match else None
from typing import Dict, List
from sqlalchemy import (
BigInteger,
Column,
Date,
DateTime,
Integer,
MetaData,
String,
Table,
)
from scripts.utils.databricks_utils import DatabricksSQLUtility
from scripts.utils.model_convertor_utils import TypeMapper
class DataBricksSQLLayer(DatabricksSQLUtility):
    """SQL DDL helper for a project's Unity Catalog: external and timeseries tables."""

    def __init__(self, catalog_name: str, project_id: str, schema: str):
        # The base class resolves the project-scoped catalog name; the target
        # schema is kept for table placement.
        super().__init__(catalog_name=catalog_name, project_id=project_id)
        self.schema = schema

    def create_external_table_from_structure(
        self,
        table: Table,
        external_location: str,
        # NOTE(review): these two defaults should be typed Optional[...];
        # Optional is not imported in this module, so left as-is here.
        table_properties: Dict[str, str] = None,
        partition_columns: list = None,
    ) -> str:
        """
        Create an external table from a model class.
        Args:
            table: The model class to create the external table from.
            external_location: The external location path.
            file_format: The file format of the data files.
            table_properties: Additional table properties.
            partition_columns: List of columns to partition the table by.
        Returns:
            External Location - Returns the external location
        """
        schema_table = f"{table.schema}.{table.name}" if table.schema else table.name
        columns_sql = TypeMapper().extract_columns_without_constraints(table)
        # Final data path: <location>/<catalog>/<format>/<schema.table>
        external_location = (
            f"{external_location}/{self.catalog_name}/{file_format}/{schema_table}"
        )
        # Assemble the DDL piece by piece; optional clauses appended below.
        sql_parts = [
            f"CREATE TABLE IF NOT EXISTS {schema_table}",
            f"({columns_sql})",
            f"USING {file_format}",
            f"LOCATION '{external_location}'",
        ]
        if partition_columns:
            partition_clause = ", ".join(partition_columns)
            sql_parts.append(f"PARTITIONED BY ({partition_clause})")
        if table_properties:
            props = [f"'{k}' = '{v}'" for k, v in table_properties.items()]
            props_sql = ",\n ".join(props)
            sql_parts.append(f"TBLPROPERTIES (\n {props_sql}\n)")
        create_sql = "\n".join(sql_parts)
        self.execute_sql_statement(create_sql)
        return external_location

    def create_timeseries_table(self, columns: List[str], external_location: str):
        """
        Create a timeseries table model and all columns will be of type String
        Args:
            columns: List of columns in the table
            external_location: The external location path
        Example:
            columns = [l1,l2,enterprise]
        Returns:
            Timeseries Table model
        """
        # Fixed columns every timeseries table carries.
        table_columns = [
            Column("timestamp", BigInteger, nullable=False),
            Column("dt_timestamp", DateTime, nullable=False),
            Column("dt_date", Date, nullable=False),
            Column("dt_hour", Integer, nullable=False),
            Column("value", String, nullable=False),
            Column("value_type", String, nullable=False, default="float"),
            Column("c3", String, nullable=False),
        ]
        # Standard tag columns plus the caller-supplied hierarchy columns,
        # all stored as nullable strings.
        default_columns = ["c1", "c5", "Q", "T", "D", "P", "A", "B", *columns]
        table_columns.extend(
            [Column(col_name, String, nullable=True) for col_name in default_columns]
        )
        partition_columns = ["dt_date", "dt_hour", "c3"]
        table_properties = {
            "parquet.compression": "snappy",  # Fast decompression for frequent queries
            "parquet.page.size": "524288",  # 512KB - better time-range filtering
            "parquet.block.size": "268435456",  # 256MB - efficient sequential reads
            "serialization.format": "1",  # Support for arrays/complex types
        }
        table_obj = Table(
            "timeseries_data", MetaData(), *table_columns, schema=self.schema
        )
        self.create_external_table_from_structure(
            table=table_obj,
            external_location=external_location,
            partition_columns=partition_columns,
            table_properties=table_properties,
        )
        return external_location
import logging
from typing import Dict, List
from ut_security_util.security_tools.auth_util import HTTPXRequestHandler
from scripts.db.databricks import DatabricksManager
from scripts.utils.httpx_util import HTTPXRequestUtil
class DatabricksJobManager:
class DatabricksJobManager(DatabricksManager):
def __init__(self, databricks_host: str, access_token: str):
"""
Initialize Databricks job manager
......@@ -14,15 +14,8 @@ class DatabricksJobManager:
databricks_host: Your Databricks workspace URL
access_token: Personal access token or service principal token
"""
self.host = (
databricks_host
if "https://" in databricks_host
else f"https://{databricks_host}"
)
self.headers = {
"Authorization": f"Bearer {access_token}",
"Content-Type": "application/json",
}
super().__init__(databricks_host, access_token)
self.base_url = f"{self.host}/api/2.1/jobs"
def create_job(self, job_config: dict):
"""
......@@ -31,7 +24,7 @@ class DatabricksJobManager:
Args:
job_config: Dictionary containing job configuration
"""
url = f"{self.host}/api/2.1/jobs/create"
url = f"{self.base_url}/create"
response = HTTPXRequestUtil(url).post(headers=self.headers, json=job_config)
......@@ -53,7 +46,7 @@ class DatabricksJobManager:
job_id: The ID of the job to run
parameters: Dictionary of parameters to pass to the job
"""
url = f"{self.host}/api/2.1/jobs/run-now"
url = f"{self.base_url}/run-now"
payload = {"job_id": job_id}
......@@ -78,12 +71,10 @@ class DatabricksJobManager:
Args:
run_id: The ID of the job run
"""
url = f"{self.host}/api/2.1/jobs/runs/get"
url = f"{self.base_url}/runs/get"
params = {"run_id": run_id}
response = HTTPXRequestHandler(url).get(
url, headers=self.headers, params=params
)
response = HTTPXRequestUtil(url).get(url, headers=self.headers, params=params)
if response.status_code == 200:
return response.json()
......@@ -93,14 +84,57 @@ class DatabricksJobManager:
)
return None
    def get_job_runs(
        self, job_id: int, active_only: bool = False, limit: int = 20
    ) -> List[Dict]:
        """
        List runs of a job via the Jobs runs/list endpoint.
        Args:
            job_id: The ID of the job
            active_only: When True, ask the API to return only active runs
            limit: Maximum number of runs to return
        Returns:
            The "runs" array from the response ([] when absent)
        Raises:
            An HTTP-status error from raise_for_status() on non-2xx responses
            # NOTE(review): exact exception type depends on HTTPXRequestUtil — confirm
        """
        url = f"{self.base_url}/runs/list"
        params = {
            "job_id": job_id,
            "limit": limit,
            # API expects string booleans in query params.
            "active_only": "true" if active_only else "false",
        }
        response = HTTPXRequestUtil(url).get(headers=self.headers, params=params)
        response.raise_for_status()
        return response.json().get("runs", [])
def is_job_running(self, job_id) -> Dict:
"""
Check if a job has any active runs
Returns:
Dict with 'is_running' boolean and 'active_runs' list
"""
try:
active_runs = self.get_job_runs(job_id, active_only=True)
running_states = ["PENDING", "RUNNING", "TERMINATING"]
active_running_runs = [
run
for run in active_runs
if run.get("state", {}).get("life_cycle_state") in running_states
]
return {
"is_running": len(active_running_runs) > 0,
"active_runs": active_running_runs,
"total_active_runs": len(active_running_runs),
}
except Exception as e:
logging.error(f"Error checking job status: {e}")
return {"is_running": False, "active_runs": [], "total_active_runs": 0}
@staticmethod
def create_job_config_for_serverless(notebook_path: str, job_name: str):
def create_job_config_for_serverless(
notebook_path: str, job_name: str, job_parameters: dict, tags: dict
):
"""
Create job configuration for a parameterized notebook
Args:
notebook_path: Path to the notebook in Databricks workspace
job_name: Name of the job
job_parameters: Dictionary of parameters to pass to the notebook
tags: Dictionary of tags to apply to the job
"""
return {
......@@ -110,18 +144,44 @@ class DatabricksJobManager:
"task_key": "table_update_task",
"notebook_task": {
"notebook_path": notebook_path,
"base_parameters": {"input_message": "default_value"},
"base_parameters": job_parameters,
},
"timeout_seconds": 3600,
}
],
"max_concurrent_runs": 10,
"tags": {
"purpose": (
"metadata_ingestion"
if "ingestion" in job_name
else "metadata_deletion"
),
"compute_type": "serverless",
"tags": tags,
}
@staticmethod
def create_job_config_for_server(
notebook_path: str,
job_name: str,
job_parameters: dict,
tags: dict,
cluster_config: dict,
):
"""
Create job configuration for a parameterized notebook
Args:
notebook_path: Path to the notebook in Databricks workspace
job_name: Name of the job
job_parameters: Dictionary of parameters to pass to the notebook
tags: Dictionary of tags to apply to the job
cluster_config: Dictionary of cluster configuration ({"existing_cluster_id": cluster_id})
"""
return {
"name": job_name,
**cluster_config,
"notebook_task": {
"notebook_path": notebook_path,
"base_parameters": job_parameters,
},
"timeout_seconds": 0, # No timeout - run indefinitely
"max_concurrent_runs": 1,
"max_retries": -1, # Infinite retries
"retry_on_timeout": True,
"tags": tags,
}
import logging
from typing import List, Union
from scripts.db.databricks import DatabricksManager
from scripts.utils.httpx_util import HTTPXRequestUtil
class DatabricksLibraryManager(DatabricksManager):
    """Install libraries on Databricks clusters via the Libraries 2.0 REST API."""

    def __init__(self, databricks_host: str, access_token: str):
        """
        Initialize Databricks library manager
        databricks_host: Your Databricks workspace URL
        access_token: Personal access token or service principal token
        """
        super().__init__(databricks_host, access_token)
        self.base_url = f"{self.host}/api/2.0/libraries"

    def install_libraries(self, libraries: Union[str, list], cluster_id: str):
        """
        Install libraries in Databricks cluster
        libraries: List of library specs, or a single spec
        cluster_id: ID of the cluster to install libraries in
        """
        # BUG FIX: the install endpoint expects a "libraries" array; a bare
        # (non-list) spec was previously sent through unchanged, so the
        # advertised Union[str, list] signature did not actually work.
        if not isinstance(libraries, list):
            libraries = [libraries]
        url = f"{self.base_url}/install"
        payload = {"cluster_id": cluster_id, "libraries": libraries}
        response = HTTPXRequestUtil(url).post(headers=self.headers, json=payload)
        if response.status_code == 200:
            logging.info("Libraries installed successfully")
        else:
            logging.error(f"Failed to install libraries: {response.text}")

    @staticmethod
    def default_libraries() -> List[dict]:
        """Library specs required for Event Hub streaming workloads."""
        return [
            {"pypi": {"package": "azure-eventhub"}},
            {
                "maven": {
                    "coordinates": "com.microsoft.azure:azure-eventhubs-spark_2.12:2.3.22"
                }
            },
        ]
import base64
import logging
from scripts.db.databricks import DatabricksManager
from scripts.utils.httpx_util import HTTPXRequestUtil
class NotebookManager:
class DatabricksNotebookManager(DatabricksManager):
def __init__(self, databricks_host, access_token):
"""
Initialize Databricks connection
......@@ -13,15 +14,7 @@ class NotebookManager:
databricks_host: Your Databricks workspace URL (e.g., 'https://your-workspace.cloud.databricks.com')
access_token: Personal access token or service principal token
"""
self.host = (
databricks_host
if "https://" in databricks_host
else f"https://{databricks_host}"
)
self.headers = {
"Authorization": f"Bearer {access_token}",
"Content-Type": "application/json",
}
super().__init__(databricks_host, access_token)
def create_notebook(
self, notebook_path, notebook_code: str, language="PYTHON", overwrite=True
......
from ut_sql_utils.asyncio import SQLSessionManager
from ut_sql_utils.asyncio import DeclarativeBaseClassFactory, SQLSessionManager
from ut_sql_utils.asyncio.declarative_utils import DeclarativeUtilsFactory
from scripts.db.redis.project_details import project_details_db
# Shared SQL session plumbing for the "unified_model" database.
sql_database = "unified_model"
# Declarative base bound to the database name.
Base = DeclarativeBaseClassFactory(sql_database)
session_manager = SQLSessionManager(project_details_db)
# FastAPI dependency that yields a DB session.
get_db = session_manager.get_db_factory(database=sql_database)
# FastAPI dependency that yields DeclarativeUtils for this database.
get_declarative_utils = DeclarativeUtilsFactory.get_declarative_utils_factory(
    sql_database, session_manager
)
import logging
from typing import Annotated, Optional
from fastapi import Cookie, Depends, Header, HTTPException, Request
from ut_dev_utils import ILensErrors
from scripts.config import DatabricksConfig
from scripts.db.redis.project_details import fetch_level_details
from scripts.schemas import ModelCreatorSchema
async def get_project_id_advanced(
    request: Request,
    # Cookie parameter
    project_id_cookie: Annotated[Optional[str], Cookie(alias="projectId")] = None,
    # Header parameter
    project_id_header: Annotated[Optional[str], Header(alias="projectId")] = None,
) -> str:
    """Extract project_id with priority: Cookie > Header > Query > Body.

    (BUG FIX: the old docstring claimed Body > Query, but the code checks the
    query string before falling back to the request body.)

    Raises:
        HTTPException: 400 when no project id is found in any source.
    """
    project_id = (
        project_id_cookie
        or project_id_header
        or request.query_params.get("project_id")
        or request.query_params.get("projectId")
    )
    # Best-effort fallback to the JSON body for mutating requests.
    if not project_id and request.method in ["POST", "PUT", "PATCH"]:
        try:
            body = await request.json()
            # Accept both spellings, consistent with the query-param lookup.
            project_id = body.get("project_id") or body.get("projectId")
        except Exception as e:
            # Unparseable/absent body is not fatal; we fail below if nothing found.
            logging.exception(f"Error getting project_id from request body: {e}")
    if not project_id:
        raise HTTPException(
            status_code=400,
            detail={
                "error": "project_id not found",
                "sources_checked": [
                    "cookies",
                    "headers",
                    "query_params",
                    "request_body",
                ],
                "example": "Add project_id in cookie, header, query param, or request body",
            },
        )
    return project_id
async def get_databricks_config(project_id: str = Depends(get_project_id_advanced)):
    """Resolve the Databricks configuration for the extracted project id."""
    try:
        config = get_databricks_details_from_redis(project_id)
    except (ValueError, ILensErrors) as e:
        # Normalize lookup/validation failures into a single error type.
        raise ILensErrors(message=f"Configuration Error: {str(e)}")
    return config
def get_databricks_details_from_redis(project_id: str) -> ModelCreatorSchema:
    """Build a ModelCreatorSchema from the project's Databricks details in Redis.

    Raises ILensErrors when the details are absent or required keys are empty.
    """
    project_details = fetch_level_details(project_id, raw=True)
    if not project_details or "databricks_details" not in project_details:
        raise ILensErrors(message=f"No Databricks config for project {project_id}")
    db_config = project_details["databricks_details"]
    required_keys = [
        "databricks_host",
        "databricks_access_token",
        "databricks_storage_path",
        "eventhub_connection_string",
    ]
    missing = [key for key in required_keys if not db_config.get(key)]
    if missing:
        raise ILensErrors(
            message=f'Missing: {", ".join(missing)} for project {project_id}'
        )
    schema_kwargs = {key: db_config[key] for key in required_keys}
    port = db_config.get("databricks_port", DatabricksConfig.DATABRICKS_DEFAULT_PORT)
    return ModelCreatorSchema(databricks_port=port, **schema_kwargs)
......@@ -13,7 +13,7 @@ class ModelCreatorAgent:
async def model_creator_agent(message: ModelCreatorSchema):
declarative_utils = await DeclarativeUtilsFactory.get_declarative_utils(
raw_database="unified_model",
project_id=message.meta.project_id,
project_id=message.project_id,
session_manager=session_manager,
schema=message.schema,
)
......
from ut_dev_utils.errors import ILensErrors
class ExternalServiceError(ILensErrors):
    """Raised when external service calls fail"""

    def __init__(self, message: str, status_code: int = 200):
        # NOTE(review): status_code defaults to 200 for an error response —
        # confirm this is an intentional project convention.
        super().__init__(message=message, status_code=status_code)
class ResourceNotFoundError(ILensErrors):
    """Raised when a requested resource is not found"""

    def __init__(self, message: str, status_code: int = 200):
        # NOTE(review): status_code defaults to 200 for an error response —
        # confirm this is an intentional project convention.
        super().__init__(message=message, status_code=status_code)
class GenericErrors(ILensErrors):
    """Raised for generic, otherwise-uncategorized application errors"""
    # (Docstring fixed: it was a copy-paste of ExternalServiceError's.)

    def __init__(self, message: str, status_code: int = 200):
        super().__init__(message=message, status_code=status_code)
from typing import Any, Dict, List, Optional, Union
from pydantic import BaseModel, Field, model_validator
from ut_security_util import MetaInfoSchema
from pydantic import BaseModel, Field, computed_field, model_validator
from scripts.config import DatabricksConfig
class ModelCreatorSchema(BaseModel):
    # NOTE(review): this block merges pre- and post-diff field lists, so several
    # fields are declared twice; in a Python class body the later declaration
    # wins. Restore the intended (post-diff) field set from the repository.
    meta: MetaInfoSchema
    # NOTE(review): "schema" shadows BaseModel.schema — verify this is intended.
    schema: Optional[str] = DatabricksConfig.DATABRICKS_PUBLIC_SCHEMA_NAME
    databricks_host: str = DatabricksConfig.DATABRICKS_HOST
    databricks_port: int = DatabricksConfig.DATABRICKS_PORT
    databricks_access_token: str = DatabricksConfig.DATABRICKS_ACCESS_TOKEN
    databricks_http_path: str = DatabricksConfig.DATABRICKS_HTTP_PATH
    databricks_host: str
    databricks_port: int
    databricks_access_token: str
    databricks_user_email: str = "aniket.dhale@ilenscloud.onmicrosoft.com"
    databricks_storage_path: str = DatabricksConfig.DATABRICKS_STORAGE_PATH
    databricks_storage_path: str
    databricks_http_path: Optional[str] = None
    eventhub_connection_string: str

    @computed_field
    @property
    def databricks_uri(self) -> Optional[str]:
        """Automatically computed databricks URI that updates when databricks_http_path changes"""
        if self.databricks_http_path:
            return (
                f"databricks://token:{self.databricks_access_token}@{self.databricks_host}:{self.databricks_port}"
                f"?http_path={self.databricks_http_path}"
            )
        return None
class ModelInstanceSchema(BaseModel):
......@@ -25,15 +35,23 @@ class ModelInstanceSchema(BaseModel):
sql_schema: Optional[str] = Field(
default=DatabricksConfig.DATABRICKS_PUBLIC_SCHEMA_NAME, alias="schema"
)
databricks_host: str = DatabricksConfig.DATABRICKS_HOST
databricks_port: int = DatabricksConfig.DATABRICKS_PORT
databricks_access_token: str = DatabricksConfig.DATABRICKS_ACCESS_TOKEN
databricks_http_path: str = DatabricksConfig.DATABRICKS_HTTP_PATH
databricks_user_email: str = "aniket.dhale@ilenscloud.onmicrosoft.com"
databricks_storage_path: str = DatabricksConfig.DATABRICKS_STORAGE_PATH
databricks_host: str
databricks_port: int
databricks_access_token: str
databricks_http_path: str
databricks_user_email: str
databricks_storage_path: str
@model_validator(mode="before")
def validate_data(cls, values: Dict[str, Any]) -> Dict[str, Any]:
if "data" in values and isinstance(values["data"], dict):
values["data"] = [values["data"]]
return values
@model_validator(mode="before")
def prepare_databricks_uri(cls, values):
values["databricks_uri"] = (
f"databricks://token:{values['databricks_access_token']}@{values['databricks_host']}:{values['databricks_port']}"
f"?http_path={values['databricks_http_path']}"
)
return values
......@@ -4,8 +4,6 @@ from typing import Optional
from sqlalchemy import create_engine, text
from ut_dev_utils import get_db_name
from scripts.config import DatabricksConfig
class DatabricksSQLUtility:
def __init__(self, catalog_name: str, project_id: str):
......@@ -18,7 +16,7 @@ class DatabricksSQLUtility:
self.catalog_name = get_db_name(project_id=project_id, database=catalog_name)
self.engine = None
def connect_to_databricks(self):
def connect_to_databricks(self, databricks_uri: str):
"""
Connect to Databricks using sqlalchemy-databricks
"""
......@@ -26,7 +24,7 @@ class DatabricksSQLUtility:
# Build connection string for sqlalchemy-databricks
self.engine = create_engine(
DatabricksConfig.DATABRICKS_URI,
databricks_uri,
pool_pre_ping=True,
pool_recycle=3600,
echo=False,
......@@ -160,6 +158,25 @@ class DatabricksSQLUtility:
)
raise
def create_volume(self, volume_name: str, location_name: str = None) -> str:
"""
Create a volume in Unity Catalog
volume_name: Name for the volume(<catalog>.<schema>.<external-volume-name>)
location_name: Name of the external location
"""
if location_name:
ddl = f"CREATE EXTERNAL VOLUME IF NOT EXISTS {volume_name}"
ddl += f"\nLOCATION '{location_name}'"
else:
ddl = f"CREATE VOLUME IF NOT EXISTS `{volume_name}`"
try:
self.execute_sql_statement(ddl)
logger.info(f"Volume '{volume_name}' created successfully")
return volume_name
except Exception as e:
logger.error(f"Failed to create volume '{volume_name}': {str(e)}")
raise
def execute_sql_statement(self, query: str):
try:
with self.engine.connect() as conn:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment