Refactoring Frontend #289

Open
wants to merge 20 commits into base: legion-0.2
Commits (20)
bc1dcc7
Refactors Gradio Frontend into multiple Classes
sebastianGruenwald Nov 5, 2024
cea94f1
Unifies all imports to sql_migration_assistant.
sebastianGruenwald Nov 5, 2024
f4820ed
Optimizes Imports and reformats files
sebastianGruenwald Nov 5, 2024
39813af
Makes Module installable. Tested in notebook
sebastianGruenwald Nov 6, 2024
7f31ad1
Makes Module installable. Tested in notebook
sebastianGruenwald Nov 6, 2024
d6a0cc8
Refactoring
sebastianGruenwald Nov 19, 2024
1748700
changes config path from absolute to relative
sebastianGruenwald Nov 21, 2024
daefae2
adds option to configure profile in commandline
sebastianGruenwald Nov 21, 2024
7b97c24
adds configs to installable files to remove dependency to Workspace p…
sebastianGruenwald Nov 21, 2024
5f91084
adds configs to installable files to remove dependency to Workspace p…
sebastianGruenwald Nov 21, 2024
027c15f
adds configs to installable files to remove dependency to Workspace p…
sebastianGruenwald Nov 21, 2024
2e25ae5
bug fixes
sebastianGruenwald Nov 21, 2024
3e45275
Formatting
sebastianGruenwald Nov 21, 2024
eadeaca
Merge branch 'main' into legion/refactoring/refactor_frontend
sebastianGruenwald Nov 21, 2024
0d72473
Merges main
sebastianGruenwald Nov 21, 2024
88aa9c5
Changed formatting to black
sebastianGruenwald Nov 26, 2024
107f951
removed unnecessary comments
sebastianGruenwald Nov 26, 2024
9be7b9d
removed unnecessary class
sebastianGruenwald Nov 26, 2024
a448adb
Fix for save prompt
sebastianGruenwald Nov 26, 2024
d7a959e
Removed unnecessary code
sebastianGruenwald Nov 26, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -130,6 +130,7 @@ ipython_config.py
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
poetry.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
4 changes: 2 additions & 2 deletions cli.py
@@ -10,9 +10,9 @@ def ip_access_list_analyzer(**args):
import ip_access_list_analyzer.ip_acl_analyzer as analyzer
analyzer.main(args)

def sql_migration_assistant(**args):
def sql_migration_assistant(**kwargs):
from sql_migration_assistant import hello
hello()
hello(**kwargs)

MAPPING = {
"ip-access-list-analyzer": ip_access_list_analyzer,
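The switch from `**args` to `**kwargs` in `cli.py` matters because the dispatcher forwards parsed CLI flags into each tool's entry point. A minimal sketch of that dispatch pattern, using stub functions rather than the repo's actual entry points:

```python
# Stub entry points standing in for the real tools wired up in cli.py.
def ip_access_list_analyzer(**kwargs):
    return {"tool": "ip-access-list-analyzer", "kwargs": kwargs}


def sql_migration_assistant(**kwargs):
    # With **kwargs forwarded (rather than swallowed), flags such as
    # profile="DEV" reach the tool's entry point.
    return {"tool": "sql-migration-assistant", "kwargs": kwargs}


MAPPING = {
    "ip-access-list-analyzer": ip_access_list_analyzer,
    "sql-migration-assistant": sql_migration_assistant,
}


def dispatch(tool, **kwargs):
    # Look up the requested tool and pass the CLI flags straight through.
    return MAPPING[tool](**kwargs)


result = dispatch("sql-migration-assistant", profile="DEV")
print(result["kwargs"])  # {'profile': 'DEV'}
```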
2 changes: 2 additions & 0 deletions sql-migration-assistant/.gitignore
@@ -0,0 +1,2 @@
.databrickscfg
.databricks
@@ -15,45 +15,44 @@ tags:
# Project Legion - SQL Migration Assistant

Legion is a Databricks field project to accelerate migrations onto Databricks by leveraging the platform’s generative AI
capabilities. It uses an LLM for code conversion and intent summarisation, presented to users in a front-end web
application.

Legion provides a chatbot interface to users for translating input code (for example T-SQL to Databricks SQL) and
summarising the intent and business purpose of the code. This intent is then embedded for serving in a Vector Search
index for finding similar pieces of code. This presents an opportunity for increased collaboration (find out who is
working on similar projects), rationalisation (identify duplicates based on intent) and discoverability (semantic
search).

Legion is a solution accelerator - it is *not* a fully baked solution. This is something for you, the customer, to take
on and own. This allows you to present a project to upskill your employees, leverage GenAI for a real use case,
customise the application to your needs, and entirely own the IP.

## Installation Videos


https://github.com/user-attachments/assets/e665bcf4-265f-4a47-81eb-60845a72c798

https://github.com/user-attachments/assets/fa622f96-a78c-40b8-9eb9-f6671c4d7b47

https://github.com/user-attachments/assets/1a58a1b5-2dcf-4624-b93f-214735162584



Setting Legion up is a simple and automated process. Ensure you have the
[Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) installed and configured with the correct
workspace.

Once the Databricks CLI has been installed and configured, run the following command to install the Databricks Labs
Sandbox and the SQL Migration Assistant.

```bash
databricks labs install sandbox && databricks labs sandbox sql-migration-assistant
```

### What Legion needs - during setup above you will create or choose existing resources for the following:

- A no-isolation shared cluster to host the front-end application.
- A catalog and schema in Unity Catalog.
- A table to store the code intent statements and their embeddings.
- A vector search endpoint and an embedding model: see docs
  https://docs.databricks.com/en/generative-ai/vector-search.html#how-to-set-up-vector-search
- A chat LLM. Pay Per Token is recommended where available, but the setup will also allow for creation of
  a provisioned throughput endpoint.
- A PAT stored in a secret scope chosen by you, under the key `sql-migration-pat`.
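The PAT requirement above can be satisfied ahead of time with the Databricks CLI. A sketch of one possible way to do it; the scope name `legion` is an assumption (any scope you choose works), only the key name `sql-migration-pat` is fixed, and exact flags may differ by CLI version:

```shell
# Create a secret scope (the name "legion" is illustrative).
databricks secrets create-scope legion

# Store a personal access token under the key the assistant expects.
# The CLI prompts for the secret value, so the PAT never lands in shell history.
databricks secrets put-secret legion sql-migration-pat
```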
@@ -13,7 +13,6 @@
import os
import sys


sys.path.insert(0, os.path.abspath("../../python"))
sys.path.append(os.path.abspath("./_theme"))
# -- Project information -----------------------------------------------------
@@ -1,28 +1,22 @@
# Databricks notebook source
# DBTITLE 1,get params
import json

from pyspark.sql.types import (
ArrayType,
StructType,
StructField,
StringType,
MapType,
IntegerType,
TimestampType,
)
import pyspark.sql.functions as f
from pyspark.sql.functions import udf, pandas_udf

agent_configs = json.loads(dbutils.widgets.get("agent_configs"))
app_configs = json.loads(dbutils.widgets.get("app_configs"))


# COMMAND ----------

checkpoint_dir = app_configs["VOLUME_NAME_CHECKPOINT_PATH"]
volume_path = app_configs["VOLUME_NAME_INPUT_PATH"]


# COMMAND ----------

bronze_raw_code = f'{app_configs["CATALOG"]}.{app_configs["SCHEMA"]}.bronze_raw_code'
@@ -70,7 +64,6 @@
"""
)


silver_llm_responses = (
f'{app_configs["CATALOG"]}.{app_configs["SCHEMA"]}.silver_llm_responses'
)
@@ -87,7 +80,6 @@
"""
)


gold_table = (
f'{app_configs["CATALOG"]}.{app_configs["SCHEMA"]}.gold_transformed_notebooks'
)
@@ -104,7 +96,6 @@
"""
)


# COMMAND ----------

# DBTITLE 1,convert agent_configs input string to a dataframe
@@ -1,19 +1,14 @@
# Databricks notebook source
import json

import pyspark.sql.functions as f
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import ChatMessage, ChatMessageRole
import json
import os
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import (
ArrayType,
StructType,
StructField,
StringType,
MapType,
IntegerType,
TimestampType,
)
import pyspark.sql.functions as f
from pyspark.sql.functions import udf, pandas_udf

# COMMAND ----------

@@ -1,10 +1,11 @@
# Databricks notebook source
import base64
import json

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.workspace import ImportFormat, Language
from pyspark.sql import functions as f
from pyspark.sql.types import *
import json

# COMMAND ----------

@@ -34,6 +35,7 @@
prompt_id = dbutils.jobs.taskValues.get(taskKey="ingest_to_holding", key="promptID")
output_volume_path = app_configs["VOLUME_NAME_OUTPUT_PATH"]


# COMMAND ----------


@@ -110,7 +112,6 @@ def write_notebook_code(llm_responses):

gold_df.display()


# COMMAND ----------

temp_table_name = "gold_temp"
11 changes: 11 additions & 0 deletions sql-migration-assistant/requirements.txt
@@ -0,0 +1,11 @@
databricks-sdk==0.30.0
pyyaml
databricks-labs-blueprint==0.8.2
databricks-labs-lsql==0.9.0
gradio==5.5.0
aiohttp==3.10.5
fastapi
pydantic==2.8.2
dbtunnel==0.14.6
mlflow
openai
@@ -7,17 +7,13 @@
# MAGIC If you want to share the app with users outside of Databricks, for example so non-technical SMEs can contribute to LLM prompt development, the notebook needs to run on a no-isolation shared cluster.

# COMMAND ----------
pip install databricks-sdk -U -q
%pip install .

# COMMAND ----------
pip install gradio==4.27.0 pyyaml aiohttp==3.10.5 databricks-labs-blueprint==0.8.2 databricks-labs-lsql==0.9.0 -q
dbutils.library.restartPython()

# COMMAND ----------
pip install fastapi==0.112.2 pydantic==2.8.2 dbtunnel==0.14.6 openai -q

# COMMAND ----------
dbutils.library.restartPython()
from sql_migration_assistant.utils.runindatabricks import run_app

# COMMAND ----------
from utils.runindatabricks import run_app
run_app()
26 changes: 26 additions & 0 deletions sql-migration-assistant/setup.py
@@ -0,0 +1,26 @@
from setuptools import setup, find_packages


# Read the requirements.txt file
def load_requirements(filename="requirements.txt"):
with open(filename, "r") as file:
return file.read().splitlines()


setup(
name="sql_migration_assistant",
version="0.1",
packages=find_packages(where="src"), # Specify src as the package directory
package_dir={"": "src"},
include_package_data=True, # Include files specified in MANIFEST.in
package_data={
"sql_migration_assistant": ["config.yml"], # Include YAML file
},
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
install_requires=load_requirements(),
python_requires=">=3.10",
)
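The `load_requirements` helper in the new `setup.py` simply splits `requirements.txt` into lines. A quick sketch of that behaviour against a temporary file (not the repo's real requirements file); note that blank lines would come through as empty strings, so filtering them would be a small hardening:

```python
import os
import tempfile


def load_requirements(filename="requirements.txt"):
    # Same pattern as in setup.py: one requirement per line, newlines stripped.
    with open(filename, "r") as file:
        return file.read().splitlines()


with tempfile.TemporaryDirectory() as d:
    path = os.path.join(d, "requirements.txt")
    with open(path, "w") as fh:
        fh.write("databricks-sdk==0.30.0\npyyaml\n")
    reqs = load_requirements(path)

print(reqs)  # ['databricks-sdk==0.30.0', 'pyyaml']
```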
@@ -1,12 +1,18 @@
from sql_migration_assistant.utils.initialsetup import SetUpMigrationAssistant
from databricks.sdk import WorkspaceClient
from databricks.labs.blueprint.tui import Prompts
import yaml
from pathlib import Path

import yaml
from databricks.labs.blueprint.tui import Prompts
from databricks.sdk import WorkspaceClient

from sql_migration_assistant.utils.initialsetup import SetUpMigrationAssistant


def hello():
w = WorkspaceClient(product="sql_migration_assistant", product_version="0.0.1")
def hello(**kwargs):
w = WorkspaceClient(
product="sql_migration_assistant",
product_version="0.0.1",
profile=kwargs.get("profile"),
)
p = Prompts()
setter_upper = SetUpMigrationAssistant()
setter_upper.check_cloud(w)
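The new `hello(**kwargs)` signature lets the CLI thread a profile option through to `WorkspaceClient`. A small sketch of the pattern, with a plain dict standing in for the real client: `kwargs.get("profile")` yields `None` when no profile was supplied, so the SDK can fall back to its default credential resolution.

```python
def build_client_kwargs(**kwargs):
    # Mirrors the hello(**kwargs) change: the profile is read with
    # kwargs.get, so it is None when no profile option was given. This
    # dict is a stand-in for the arguments passed to WorkspaceClient.
    return {
        "product": "sql_migration_assistant",
        "product_version": "0.0.1",
        "profile": kwargs.get("profile"),
    }


print(build_client_kwargs(profile="DEV")["profile"])  # DEV
print(build_client_kwargs()["profile"])               # None
```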
@@ -1,8 +1,5 @@
import gradio as gr

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.serving import ChatMessage, ChatMessageRole


class LLMCalls:
def __init__(self, openai_client, foundation_llm_name):
@@ -44,7 +41,7 @@ def call_llm(self, messages, max_tokens, temperature):
def llm_translate(self, system_prompt, input_code, max_tokens, temperature):
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": input_code}
{"role": "user", "content": input_code},
]

# call the LLM end point.
@@ -58,7 +55,7 @@ def llm_intent(self, system_prompt, input_code, max_tokens, temperature):
def llm_intent(self, system_prompt, input_code, max_tokens, temperature):
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": input_code}
{"role": "user", "content": input_code},
]

# call the LLM end point.
@@ -1,4 +1,6 @@
import gradio as gr


class PromptHelper:
def __init__(self, see, catalog, schema, prompt_table):
self.see = see
@@ -1,5 +1,5 @@
from databricks.sdk import WorkspaceClient
from databricks.labs.lsql.core import StatementExecutionExt
from databricks.sdk import WorkspaceClient


class SimilarCode:
16 changes: 16 additions & 0 deletions sql-migration-assistant/src/sql_migration_assistant/config.py
@@ -0,0 +1,16 @@
import os

FOUNDATION_MODEL_NAME = os.environ.get("SERVED_FOUNDATION_MODEL_NAME")
SQL_WAREHOUSE_ID = os.environ.get("DATABRICKS_WAREHOUSE_ID")
VECTOR_SEARCH_ENDPOINT_NAME = os.environ.get("VECTOR_SEARCH_ENDPOINT_NAME")
VS_INDEX_NAME = os.environ.get("VS_INDEX_NAME")
CODE_INTENT_TABLE_NAME = os.environ.get("CODE_INTENT_TABLE_NAME")
CATALOG = os.environ.get("CATALOG")
SCHEMA = os.environ.get("SCHEMA")
VOLUME_NAME = os.environ.get("VOLUME_NAME")
DATABRICKS_HOST = os.environ.get("DATABRICKS_HOST")
TRANSFORMATION_JOB_ID = os.environ.get("TRANSFORMATION_JOB_ID")
WORKSPACE_LOCATION = os.environ.get("WORKSPACE_LOCATION")
VOLUME_NAME_INPUT_PATH = os.environ.get("VOLUME_NAME_INPUT_PATH")
PROMPT_HISTORY_TABLE_NAME = os.environ.get("PROMPT_HISTORY_TABLE_NAME")
DATABRICKS_TOKEN = os.environ.get("DATABRICKS_TOKEN")
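Every value in the new `config.py` comes from `os.environ.get`, which silently returns `None` for unset variables. A hedged sketch of a fail-fast variant (the `REQUIRED` subset here is illustrative, not the module's actual contract):

```python
import os

# Illustrative subset of the variables config.py reads; not the
# module's actual required set.
REQUIRED = ["CATALOG", "SCHEMA", "DATABRICKS_HOST"]


def load_config(required=REQUIRED):
    # os.environ.get returns None for unset variables, so a missing value
    # would only surface later as an odd failure; collecting the gaps up
    # front gives a clearer error message.
    config = {name: os.environ.get(name) for name in required}
    missing = [name for name, value in config.items() if value is None]
    if missing:
        raise RuntimeError(f"Missing environment variables: {missing}")
    return config


os.environ["CATALOG"] = "main"
os.environ["SCHEMA"] = "default"
os.environ["DATABRICKS_HOST"] = "https://example.cloud.databricks.com"
print(load_config()["CATALOG"])  # main
```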