diff --git a/.github/workflows/convert_notebooks.py b/.github/workflows/convert_notebooks.py new file mode 100644 index 0000000..b6ee9dd --- /dev/null +++ b/.github/workflows/convert_notebooks.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +""" +Notebook to GitHub-Compatible Format Converter + +This script fixes Jupyter notebooks for GitHub rendering by: +1. Converting XML-format notebooks to standard Jupyter JSON format +2. Cleaning widget metadata that can cause GitHub rendering issues +""" + +import os +import json +import re +import nbformat +from nbformat.validator import validate + +def process_notebooks(directory="."): + """Find and process all notebook files in the repository""" + notebook_files = [] + print(f"Searching for notebooks in directory: {directory}") + for root, dirs, files in os.walk(directory): + # Skip directories that should be excluded + if '.git' in dirs: + dirs.remove('.git') # Skip git directory + if '.github' in dirs: + dirs.remove('.github') # Skip GitHub directory + if '.venv' in dirs: + dirs.remove('.venv') # Skip virtual environments + + for file in files: + if file.endswith('.ipynb'): + notebook_path = os.path.join(root, file) + print(f"Found notebook: {notebook_path}") + notebook_files.append(notebook_path) + + print(f"Found {len(notebook_files)} notebooks to process") + + success_count = 0 + for nb_path in notebook_files: + if convert_notebook(nb_path): + success_count += 1 + + print(f"Successfully rendered {success_count} out of {len(notebook_files)} notebooks") + return success_count + +def convert_notebook(filepath): + """Convert a notebook to GitHub-compatible format by cleaning widget metadata""" + print(f"\nProcessing {filepath}") + + try: + # Read the notebook content + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + + # Check if this is a XML notebook + if '(.*?)', re.DOTALL) + + matches = list(cell_pattern.finditer(content)) + if not matches: + print(f" WARNING: No cells found in {filepath}") + return False + + print(f" Found {len(matches)} cells") + + for match in matches: + cell_type, cell_content = match.groups() + + if cell_type == "markdown": + cells.append(nbformat.v4.new_markdown_cell( + source=cell_content.strip() + )) + else: # python, javascript, etc. + cells.append(nbformat.v4.new_code_cell( + source=cell_content.strip() + )) + + # Create a new notebook + nb = nbformat.v4.new_notebook() + nb.cells = cells + + # Add required metadata + nb.metadata = { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + # Add empty widget state to prevent GitHub rendering issues + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + } + + # Validate and write the notebook + validate(nb) + with open(filepath, 'w', encoding='utf-8') as f: + nbformat.write(nb, f) + + print(f" Successfully rendered {filepath} for GitHub compatibility") + return True + + else: + # It's already in JSON format, clean widget metadata + try: + notebook = json.loads(content) + print(f" Cleaning widget metadata...") + + # Remove potentially problematic widget state but keep proper structure + if 'metadata' in notebook: + # Replace with clean widget state + notebook['metadata']['widgets'] = { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + + # Clean widget metadata from cells as well + for cell in notebook.get('cells', []): + if 'metadata' in cell and 'widgets' in cell['metadata']: + del cell['metadata']['widgets'] + + # Write the cleaned notebook + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(notebook, f, indent=2) + + print(f" Successfully cleaned {filepath} for GitHub compatibility") + return True + + except json.JSONDecodeError: + print(f" ERROR: {filepath} is not in valid JSON format or XML format") + return False + + except Exception as e: + print(f" ERROR processing {filepath}: {str(e)}") + return False + +if __name__ == "__main__": + print("Rendering notebooks for GitHub compatibility...") + # Get the repository root directory from environment variable if available + repo_root = os.environ.get('GITHUB_WORKSPACE', '.') + print(f"Repository root: {repo_root}") + process_notebooks(repo_root) diff --git a/.github/workflows/render-notebooks.yml b/.github/workflows/render-notebooks.yml new file mode 100644 index 0000000..0577b6a --- /dev/null +++ b/.github/workflows/render-notebooks.yml @@ -0,0 +1,61 @@ +name: Render Notebooks for GitHub + +on: + push: + paths: + - '**.ipynb' + pull_request: + branches: + - main + workflow_dispatch: # Allows manual triggering + +permissions: + contents: write + pull-requests: write + +jobs: + render-notebooks: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} # Explicitly checkout the branch that triggered the workflow + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nbformat nbconvert jupyter + + - name: Run conversion script + run: python .github/workflows/convert_notebooks.py + + - name: Configure Git + run: | + git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" + + - name: Commit and push changes + run: | + CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) + echo "Current branch: $CURRENT_BRANCH" + + # Stage all notebook files + git add "**/*.ipynb" + + # Check if there are changes to commit + if git diff --staged --quiet; then + echo "No changes detected in notebooks" + else + echo "Changes detected in notebooks" + git commit -m "Fix notebooks for GitHub compatibility" + git push https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git HEAD:${CURRENT_BRANCH} + echo "Successfully pushed changes" + fi diff --git a/0_Overview.md b/0_Overview.md index abb8c66..878d624 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -104,7 +104,7 @@ Click here for more information about: [Z-Order & V-Order](https://github.com/Mi
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index a7b2f92..e64a298 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -299,7 +299,7 @@ https://github.com/user-attachments/assets/2a64762a-f120-4448-b0fb-7a49f4d1bedb
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index eef1221..baf619f 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -15,7 +15,7 @@ Last updated: 2025-07-16
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb index 04efdcf..03ebd49 100644 --- a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb +++ b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb @@ -1 +1,369 @@ -{"cells":[{"cell_type":"markdown","source":["# PySpark Code to Move Data from Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13"},{"cell_type":"markdown","source":["This is an example on how to work with the medallion architecture. From Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"44f47922-4e3b-45cc-81a6-c5de97634f73"},{"cell_type":"markdown","source":["## Working with 2020orders information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4e1afae8-b2ef-4e4c-9ac9-485139f19e9c"},{"cell_type":"code","source":["from pyspark.sql.types import *\n","import pyspark.sql.functions \n","from pyspark.sql import *"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"febb6c3e-6841-42c1-a633-0da056b7f69c"},{"cell_type":"code","source":["# Read the data from the bronze layer:\n","df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n","\n","df_raw_2020orders.head(2)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":4,"statement_ids":[4],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:11:30.8930212Z","session_start_time":null,"execution_start_time":"2024-10-25T19:14:18.2840528Z","execution_finish_time":"2024-10-25T19:14:24.7554027Z","parent_msg_id":"4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":11,"data":{"text/plain":"[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]"},"metadata":{}}],"execution_count":2,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"6357dc47-1bb1-4391-8f38-b5d5a2abf5b2"},{"cell_type":"code","source":["# Clean the data (e.g., filter out rows with null values in the 'age' column):\n","df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n","print(df_cleaned)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[10],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:21:35.4162864Z","session_start_time":null,"execution_start_time":"2024-10-25T19:21:35.9099818Z","execution_finish_time":"2024-10-25T19:21:36.2079156Z","parent_msg_id":"d65f6fd9-d9ab-4498-ab5d-0710bab459be"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n"]}],"execution_count":8,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"5f5c8125-cbf5-4e00-9d8f-0c437f25b37f"},{"cell_type":"code","source":["# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n","df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:23:52.3238132Z","session_start_time":null,"execution_start_time":"2024-10-25T19:23:52.7414203Z","execution_finish_time":"2024-10-25T19:24:09.4412514Z","parent_msg_id":"8c92d669-7856-4961-a9d0-c38d54833ee4"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":10,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"c83d4e46-2b49-490f-aadb-87a350c85e89"},{"cell_type":"markdown","source":["## Working with products information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f830afb3-2b02-4076-800a-85ca9fc33fea"},{"cell_type":"code","source":["# Read data from the Bronze layer\n","bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n","# Perform transformations (if any)\n","silver_df = bronze_df # Assuming no transformations for simplicity\n","# Write data to the Silver layer\n","silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:27:20.1106109Z","session_start_time":null,"execution_start_time":"2024-10-25T19:27:20.5334249Z","execution_finish_time":"2024-10-25T19:27:25.4936309Z","parent_msg_id":"bf665ff4-43d5-4b02-90a6-6c28640576c3"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"7f72ac98-4ece-4a8a-a5c5-5e1fc7273382"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"widgets":{},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"251cd515-16a3-4555-a3d2-dfd12adb2335","default_lakehouse_name":"raw_Bronze","default_lakehouse_workspace_id":"597e0afc-c8db-4f4d-8464-d13570f5b075"}}},"nbformat":4,"nbformat_minor":5} +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# PySpark Code to Move Data from Bronze to Silver" + ] + }, + { + "cell_type": "markdown", + "id": "44f47922-4e3b-45cc-81a6-c5de97634f73", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "This is an example on how to work with the medallion architecture. From Bronze to Silver" + ] + }, + { + "cell_type": "markdown", + "id": "4e1afae8-b2ef-4e4c-9ac9-485139f19e9c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Working with 2020orders information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "febb6c3e-6841-42c1-a633-0da056b7f69c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import *\n", + "import pyspark.sql.functions \n", + "from pyspark.sql import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6357dc47-1bb1-4391-8f38-b5d5a2abf5b2", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:14:24.7554027Z", + "execution_start_time": "2024-10-25T19:14:18.2840528Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf", + "queued_time": "2024-10-25T19:11:30.8930212Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 4, + "statement_ids": [ + 4 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n", + " Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the data from the bronze layer:\n", + "df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n", + "\n", + "df_raw_2020orders.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5f5c8125-cbf5-4e00-9d8f-0c437f25b37f", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:21:36.2079156Z", + "execution_start_time": "2024-10-25T19:21:35.9099818Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "d65f6fd9-d9ab-4498-ab5d-0710bab459be", + "queued_time": "2024-10-25T19:21:35.4162864Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 10, + "statement_ids": [ + 10 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n" + ] + } + ], + "source": [ + "# Clean the data (e.g., filter out rows with null values in the 'age' column):\n", + "df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n", + "print(df_cleaned)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c83d4e46-2b49-490f-aadb-87a350c85e89", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:24:09.4412514Z", + "execution_start_time": "2024-10-25T19:23:52.7414203Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "8c92d669-7856-4961-a9d0-c38d54833ee4", + "queued_time": "2024-10-25T19:23:52.3238132Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 12, + "statement_ids": [ + 12 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n", + "df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")" + ] + }, + { + "cell_type": "markdown", + "id": "f830afb3-2b02-4076-800a-85ca9fc33fea", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Working with products information" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7f72ac98-4ece-4a8a-a5c5-5e1fc7273382", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:27:25.4936309Z", + "execution_start_time": "2024-10-25T19:27:20.5334249Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "bf665ff4-43d5-4b02-90a6-6c28640576c3", + "queued_time": "2024-10-25T19:27:20.1106109Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 15, + "statement_ids": [ + 15 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Read data from the Bronze layer\n", + "bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n", + "# Perform transformations (if any)\n", + "silver_df = bronze_df # Assuming no transformations for simplicity\n", + "# Write data to the Silver layer\n", + "silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")" + ] + } + ], + "metadata": { + "dependencies": { + "lakehouse": { + "default_lakehouse": "251cd515-16a3-4555-a3d2-dfd12adb2335", + "default_lakehouse_name": "raw_Bronze", + "default_lakehouse_workspace_id": "597e0afc-c8db-4f4d-8464-d13570f5b075" + } + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb b/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb index 42bf848..d574aae 100644 --- a/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb +++ b/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb @@ -1 +1,487 @@ -{"cells":[{"cell_type":"markdown","source":["# PySpark Code to Move Data from Silver to Gold"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13"},{"cell_type":"markdown","source":["This is an example on how to work with the medallion architecture. From Silver to Gold"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"44f47922-4e3b-45cc-81a6-c5de97634f73"},{"cell_type":"markdown","source":["## Working with 2020orders_silver information"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"4e1afae8-b2ef-4e4c-9ac9-485139f19e9c"},{"cell_type":"code","source":["from pyspark.sql.types import *\n","import pyspark.sql.functions \n","from pyspark.sql import *\n","from pyspark.sql.functions import sum\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:16.4970957Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:16.9788868Z","execution_finish_time":"2024-10-25T20:29:17.2577679Z","parent_msg_id":"f6330200-43db-4e0b-9e85-773c1fa95042"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 15, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"febb6c3e-6841-42c1-a633-0da056b7f69c"},{"cell_type":"code","source":["# Read the data from the silver layer:\n","df_cleansed_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")\n","\n","df_cleansed_2020orders.head(2)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":16,"statement_ids":[16],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:17.7525504Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:18.2199944Z","execution_finish_time":"2024-10-25T20:29:26.3332937Z","parent_msg_id":"901386c6-fa37-4dad-8a8d-34e1276108f8"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 16, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":47,"data":{"text/plain":"[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]"},"metadata":{}}],"execution_count":14,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"6357dc47-1bb1-4391-8f38-b5d5a2abf5b2"},{"cell_type":"code","source":["df_cleansed_2020orders = df_cleansed_2020orders.withColumn(\"tax\", df_cleansed_2020orders[\"tax\"].cast(\"int\")) # type to int"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":17,"statement_ids":[17],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.1174684Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:26.7221718Z","execution_finish_time":"2024-10-25T20:29:26.9553583Z","parent_msg_id":"1d52d464-b1ce-40ca-a8b4-ed446b5980fd"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 17, Finished, Available, Finished)"},"metadata":{}}],"execution_count":15,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b1478c38-d56e-4a58-a551-405675f4110d"},{"cell_type":"code","source":["df_cleansed_2020orders.printSchema()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":18,"statement_ids":[18],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.3209761Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:27.3402865Z","execution_finish_time":"2024-10-25T20:29:27.5846334Z","parent_msg_id":"882470dc-dab6-4bef-bf92-1f25c81c4bad"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 18, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["root\n |-- ID: string (nullable = true)\n |-- Count: integer (nullable = true)\n |-- Date: string (nullable = true)\n |-- Name: string (nullable = true)\n |-- Style: string (nullable = true)\n |-- price: double (nullable = true)\n |-- tax: integer (nullable = true)\n\n"]}],"execution_count":16,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f9da346b-2b25-463a-821d-6b8d7bda321e"},{"cell_type":"code","source":["# Group and Aggregate the Data:\n","df_aggregated = df_cleansed_2020orders.groupBy(\"Style\").agg(sum(\"price\").alias(\"total_price_vehicles\"))\n","df_aggregated.show(10, truncate=False)\n","print(df_aggregated)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":19,"statement_ids":[19],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.5521281Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:27.9679539Z","execution_finish_time":"2024-10-25T20:29:34.3035094Z","parent_msg_id":"9a5b30d2-5552-495b-b0a7-c5cfed0bebd8"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 19, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["+-----------------------+--------------------+\n|Style |total_price_vehicles|\n+-----------------------+--------------------+\n|Mountain-200 Black, 42 |196713.42720000003 |\n|Mountain-100 Silver, 42|71399.78999999998 |\n|Mountain-200 Silver, 42|159499.30919999976 |\n|Mountain-100 Silver, 44|71399.78999999998 |\n|Road-550-W Yellow, 40 |37016.1875 |\n|Mountain-100 Silver, 38|64599.80999999997 |\n|Road-250 Red, 48 |256551.75000000044 |\n|Road-250 Red, 52 |217458.15000000034 |\n|Road-650 Red, 52 |20301.81200000001 |\n|Road-250 Black, 52 |253061.25 |\n+-----------------------+--------------------+\nonly showing top 10 rows\n\nDataFrame[Style: string, total_price_vehicles: double]\n"]}],"execution_count":17,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"5f5c8125-cbf5-4e00-9d8f-0c437f25b37f"},{"cell_type":"code","source":["# Save the cleaned data to the \"curated_Gold\" table in the Gold lakehouse:\n","df_aggregated.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_Gold.Lakehouse/Tables/2020orders_gold\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":20,"statement_ids":[20],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.7090964Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:34.7050563Z","execution_finish_time":"2024-10-25T20:29:39.5141044Z","parent_msg_id":"56e18a6c-9106-4200-a6cc-7386c03b93d3"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 20, Finished, Available, Finished)"},"metadata":{}}],"execution_count":18,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"c83d4e46-2b49-490f-aadb-87a350c85e89"},{"cell_type":"markdown","source":["## Working with products_silver information"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"f830afb3-2b02-4076-800a-85ca9fc33fea"},{"cell_type":"code","source":["# Read data from the Silver layer\n","silver_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")\n","# Perform transformations (if any)\n","silver_df = silver_df # Assuming no transformations for simplicity\n","# Write data to the Gold layer\n","silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_gold.Lakehouse/Tables/products_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":21,"statement_ids":[21],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:45.1788369Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:45.5790407Z","execution_finish_time":"2024-10-25T20:29:49.090114Z","parent_msg_id":"67830cc7-dbf9-4b47-a9ea-51ce1c29634b"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 21, Finished, Available, Finished)"},"metadata":{}}],"execution_count":19,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"7f72ac98-4ece-4a8a-a5c5-5e1fc7273382"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"widgets":{},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"251cd515-16a3-4555-a3d2-dfd12adb2335","default_lakehouse_name":"raw_Bronze","default_lakehouse_workspace_id":"597e0afc-c8db-4f4d-8464-d13570f5b075"}}},"nbformat":4,"nbformat_minor":5} +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# PySpark Code to Move Data from Silver to Gold" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13" + }, + { + "cell_type": "markdown", + "source": [ + "This is an example on how to work with the medallion architecture. From Silver to Gold" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "44f47922-4e3b-45cc-81a6-c5de97634f73" + }, + { + "cell_type": "markdown", + "source": [ + "## Working with 2020orders_silver information" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "4e1afae8-b2ef-4e4c-9ac9-485139f19e9c" + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql.types import *\n", + "import pyspark.sql.functions \n", + "from pyspark.sql import *\n", + "from pyspark.sql.functions import sum\n" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 15, + "statement_ids": [ + 15 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:16.4970957Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:16.9788868Z", + "execution_finish_time": "2024-10-25T20:29:17.2577679Z", + "parent_msg_id": "f6330200-43db-4e0b-9e85-773c1fa95042" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 15, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 13, + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "id": "febb6c3e-6841-42c1-a633-0da056b7f69c" + }, + { + "cell_type": "code", + "source": [ + "# Read the data from the silver layer:\n", + "df_cleansed_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")\n", + "\n", + "df_cleansed_2020orders.head(2)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 16, + "statement_ids": [ + 16 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:17.7525504Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:18.2199944Z", + "execution_finish_time": "2024-10-25T20:29:26.3332937Z", + "parent_msg_id": "901386c6-fa37-4dad-8a8d-34e1276108f8" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 16, Finished, Available, Finished)" + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "execution_count": 47, + "data": { + "text/plain": "[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]" + }, + "metadata": {} + } + ], + "execution_count": 14, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "6357dc47-1bb1-4391-8f38-b5d5a2abf5b2" + }, + { + "cell_type": "code", + "source": [ + "df_cleansed_2020orders = df_cleansed_2020orders.withColumn(\"tax\", df_cleansed_2020orders[\"tax\"].cast(\"int\")) # type to int" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 17, + "statement_ids": [ + 17 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.1174684Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:26.7221718Z", + "execution_finish_time": "2024-10-25T20:29:26.9553583Z", + "parent_msg_id": "1d52d464-b1ce-40ca-a8b4-ed446b5980fd" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 17, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 15, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "id": "b1478c38-d56e-4a58-a551-405675f4110d" + }, + { + "cell_type": "code", + "source": [ + "df_cleansed_2020orders.printSchema()" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 18, + "statement_ids": [ + 18 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.3209761Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:27.3402865Z", + "execution_finish_time": "2024-10-25T20:29:27.5846334Z", + "parent_msg_id": "882470dc-dab6-4bef-bf92-1f25c81c4bad" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 18, Finished, Available, Finished)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "root\n |-- ID: string (nullable = true)\n |-- Count: integer (nullable = true)\n |-- Date: string (nullable = true)\n |-- Name: string (nullable = true)\n |-- Style: string (nullable = true)\n |-- price: double (nullable = true)\n |-- tax: integer (nullable = true)\n\n" + ] + } + ], + "execution_count": 16, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "id": "f9da346b-2b25-463a-821d-6b8d7bda321e" + }, + { + "cell_type": "code", + "source": [ + "# Group and Aggregate the Data:\n", + "df_aggregated = df_cleansed_2020orders.groupBy(\"Style\").agg(sum(\"price\").alias(\"total_price_vehicles\"))\n", + "df_aggregated.show(10, truncate=False)\n", + "print(df_aggregated)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 19, + "statement_ids": [ + 19 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.5521281Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:27.9679539Z", + "execution_finish_time": "2024-10-25T20:29:34.3035094Z", + "parent_msg_id": "9a5b30d2-5552-495b-b0a7-c5cfed0bebd8" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 19, Finished, Available, Finished)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------------------+--------------------+\n|Style |total_price_vehicles|\n+-----------------------+--------------------+\n|Mountain-200 Black, 42 |196713.42720000003 |\n|Mountain-100 Silver, 42|71399.78999999998 |\n|Mountain-200 Silver, 42|159499.30919999976 |\n|Mountain-100 Silver, 44|71399.78999999998 |\n|Road-550-W Yellow, 40 |37016.1875 |\n|Mountain-100 Silver, 38|64599.80999999997 |\n|Road-250 Red, 48 |256551.75000000044 |\n|Road-250 Red, 52 |217458.15000000034 |\n|Road-650 Red, 52 |20301.81200000001 |\n|Road-250 Black, 52 |253061.25 |\n+-----------------------+--------------------+\nonly showing top 10 rows\n\nDataFrame[Style: string, total_price_vehicles: double]\n" + ] + } + ], + "execution_count": 17, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "5f5c8125-cbf5-4e00-9d8f-0c437f25b37f" + }, + { + "cell_type": "code", + "source": [ + "# Save the cleaned data to the \"curated_Gold\" table in the Gold lakehouse:\n", + "df_aggregated.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_Gold.Lakehouse/Tables/2020orders_gold\")" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 20, + "statement_ids": [ + 20 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.7090964Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:34.7050563Z", + "execution_finish_time": "2024-10-25T20:29:39.5141044Z", + "parent_msg_id": "56e18a6c-9106-4200-a6cc-7386c03b93d3" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 20, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 18, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "c83d4e46-2b49-490f-aadb-87a350c85e89" + }, + { + "cell_type": "markdown", + "source": [ + "## Working with products_silver information" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "f830afb3-2b02-4076-800a-85ca9fc33fea" + }, + { + "cell_type": "code", + "source": [ + "# Read data from the Silver layer\n", + "silver_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")\n", + "# Perform transformations (if any)\n", + "silver_df = silver_df # Assuming no transformations for simplicity\n", + "# Write data to the Gold layer\n", + "silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_gold.Lakehouse/Tables/products_silver\")" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 21, + "statement_ids": [ + 21 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:45.1788369Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:45.5790407Z", + "execution_finish_time": "2024-10-25T20:29:49.090114Z", + "parent_msg_id": "67830cc7-dbf9-4b47-a9ea-51ce1c29634b" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 21, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 19, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "7f72ac98-4ece-4a8a-a5c5-5e1fc7273382" + } + ], + "metadata": { + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "name": "synapse_pyspark", + "language": "Python", + "display_name": "Synapse PySpark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "dependencies": { + "lakehouse": { + "default_lakehouse": "251cd515-16a3-4555-a3d2-dfd12adb2335", + "default_lakehouse_name": "raw_Bronze", + "default_lakehouse_workspace_id": "597e0afc-c8db-4f4d-8464-d13570f5b075" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 8967712..909e4b3 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -397,7 +397,7 @@ Make sure to replace `"your_openai_api_key"`, `"https://your_openai_api_base/"`,
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb b/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb index 880187e..a0d1b8e 100644 --- a/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb +++ b/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb @@ -1,1202 +1,1203 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "519955e9-2dad-456d-93db-a332d38e9433", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# Fabric: Highlights into AI/LLMs" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d312e8d9-03fe-4b3d-aa6d-c52e3022ae39", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T03:58:26.7170509Z", - "execution_start_time": "2024-10-31T03:58:19.270951Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "e267b6ab-5133-4598-8251-d64374cd11e5", - "queued_time": "2024-10-31T03:58:18.9132075Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 5, - "statement_ids": [ - 5 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 5, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: synapseml\r\n", - "Version: 1.0.8\r\n", - "Summary: Synapse Machine Learning\r\n", - "Home-page: https://github.com/Microsoft/SynapseML\r\n", - "Author: Microsoft\r\n", - "Author-email: synapseml-support@microsoft.com\r\n", - "License: MIT\r\n", - "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", - "Requires: \r\n", - "Required-by: \r\n" - ] - } - ], - "source": [ - "!pip show synapseml" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "427610d0-3fae-45e3-8150-92ee7674f44c", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T03:58:28.6254349Z", - "execution_start_time": "2024-10-31T03:58:27.1124616Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "0e9f6c0f-062b-4e5d-9061-afcd89c8fd75", - "queued_time": "2024-10-31T03:58:19.3223486Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 6, - "statement_ids": [ - 6 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 6, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: langchain-openai\r\n", - "Version: 0.2.4\r\n", - "Summary: An integration package connecting OpenAI and LangChain\r\n", - "Home-page: https://github.com/langchain-ai/langchain\r\n", - "Author: \r\n", - "Author-email: \r\n", - "License: MIT\r\n", - "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", - "Requires: langchain-core, openai, tiktoken\r\n", - "Required-by: \r\n" - ] - } - ], - "source": [ - "!pip show langchain-openai" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "baeeb853-2104-4edf-abf4-4d4be50cb977", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T03:58:30.5465258Z", - "execution_start_time": "2024-10-31T03:58:29.0000586Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "716d9975-263b-4d92-b25c-b342106f5f43", - "queued_time": "2024-10-31T03:58:19.511824Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 7, - "statement_ids": [ - 7 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 7, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: langchain\r\n", - "Version: 0.3.6\r\n", - "Summary: Building applications with LLMs through composability\r\n", - "Home-page: https://github.com/langchain-ai/langchain\r\n", - "Author: \r\n", - "Author-email: \r\n", - "License: MIT\r\n", - "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", - "Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity\r\n", - "Required-by: langchain-community\r\n" - ] - } - ], - "source": [ - "!pip show langchain" - ] - }, - { - "cell_type": "markdown", - "id": "c58cc406-c4f5-4607-a740-0802e8e4b550", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Ensure you have the API key and endpoint URL for your deployed model. Set these as environment variables" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "3c8ada7c-2632-4c69-86d2-f5260ee8f1b7", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:14.3495341Z", - "execution_start_time": "2024-10-31T04:20:14.1128215Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "2573bf75-fe6d-40dc-b9f6-e06ebb9f7f73", - "queued_time": "2024-10-31T04:20:13.6194485Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 22, - "statement_ids": [ - 22 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 22, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_VERSION\"] = \"2023-08-01-preview\"\n", - "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"https://your-resource.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview\"\n", - "os.environ[\"AZURE_OPENAI_API_KEY\"] = \"your-value\"" - ] - }, - { - "cell_type": "markdown", - "id": "3fac48a9-45fb-4e86-9792-8ee340b0ac60", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Create an instance of the Azure OpenAI class using the environment variables set above" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "5db10350-8000-4cbd-9bdf-d7da62d7fe61", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:14.9382032Z", - "execution_start_time": "2024-10-31T04:20:14.7083469Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "7dfaca5a-f738-4010-bba1-f764ea70f450", - "queued_time": "2024-10-31T04:20:14.027325Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 23, - "statement_ids": [ - 23 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 23, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain_openai import AzureChatOpenAI\n", - "\n", - "# Set the API base URL\n", - "api_base = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n", - "\n", - "# Create an instance of the Azure OpenAI Class\n", - "llm = AzureChatOpenAI(\n", - " openai_api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", - " temperature=0.7,\n", - " verbose=True,\n", - " top_p=0.9\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b17d7450-34b5-4ece-8e20-a77ddcdd93c4", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Use the Azure OpenAI service to generate text or perform other language model tasks" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "cfc5fd62-085a-4eff-9192-696d9f249a8e", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:16.0500538Z", - "execution_start_time": "2024-10-31T04:20:15.2936074Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "e14e4d0b-1fd0-4dac-a07d-6479d6536ce3", - "queued_time": "2024-10-31T04:20:14.4969185Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 24, - "statement_ids": [ - 24 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 24, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "content='Salut, comment ça va ?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 33, 'total_tokens': 39, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_d54531d9eb', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}} id='run-8cb7f29a-44c1-4f65-a648-15afb2d793dc-0' usage_metadata={'input_tokens': 33, 'output_tokens': 6, 'total_tokens': 39, 'input_token_details': {}, 'output_token_details': {}}\n" - ] - } - ], - "source": [ - "# Define a prompt\n", - "messages = [\n", - " (\n", - " \"system\",\n", - " \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n", - " ),\n", - " (\"human\", \"Hi, how are you?\"),\n", - "]\n", - "\n", - "# Generate a response from the Azure OpenAI service using the invoke method\n", - "ai_msg = llm.invoke(messages)\n", - "\n", - "# Print the response\n", - "print(ai_msg)" - ] - }, - { - "cell_type": "markdown", - "id": "79729106-c7f1-4879-bc2b-871b50c2ac9a", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Define a prompt template for generating definitions" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ca633361-c27b-4294-b8a7-9fc4a316afa4", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:16.587491Z", - "execution_start_time": "2024-10-31T04:20:16.3655978Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "cc3215f4-71a5-4231-af47-9bd9a8f5698a", - "queued_time": "2024-10-31T04:20:14.7799392Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 25, - "statement_ids": [ - 25 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 25, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.prompts import PromptTemplate\n", - "\n", - "copy_prompt = PromptTemplate(\n", - " input_variables=[\"technology\"],\n", - " template=\"Define the following word: {technology}\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "899839d9-adca-4042-b662-73edcad7e432", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Create an LLMChain with the defined prompt template" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "bd4f65ca-049b-481d-bbbd-a017c6c0119b", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:17.1233668Z", - "execution_start_time": "2024-10-31T04:20:16.9052959Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "27790d83-509f-4716-bb69-9c288ad069ba", - "queued_time": "2024-10-31T04:20:15.1325692Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 26, - "statement_ids": [ - 26 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 26, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.chains import LLMChain\n", - "\n", - "chain = LLMChain(llm=llm, prompt=copy_prompt)\n" - ] - }, - { - "cell_type": "markdown", - "id": "936b3ddf-cc65-436c-ba4e-ae0abe21fc2c", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Set up the LangChain transformer to execute the processing chain\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "63a00038-37b4-49ee-9c53-128c8acf9d01", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:18.181457Z", - "execution_start_time": "2024-10-31T04:20:17.4351576Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "3fb30420-f0c9-477b-ad1a-001dc0d8d37a", - "queued_time": "2024-10-31T04:20:15.6799013Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 27, - "statement_ids": [ - 27 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 27, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from synapse.ml.cognitive.langchain import LangchainTransformer\n", - "\n", - "openai_api_key= os.environ[\"AZURE_OPENAI_API_KEY\"]\n", - "\n", - "transformer = (\n", - " LangchainTransformer()\n", - " .setInputCol(\"technology\")\n", - " .setOutputCol(\"definition\")\n", - " .setChain(chain)\n", - " .setSubscriptionKey(openai_api_key)\n", - " .setUrl(api_base)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c74293f0-925e-4987-a6a1-b3b9b8e14b9d", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Construct a DataFrame with technology names." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8e03963e-2fcf-4934-b96f-ac27b4e0353c", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:24:08.3891172Z", - "execution_start_time": "2024-10-31T04:24:02.0675933Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "856f5b73-26e8-4d20-a901-356cd92b9c2a", - "queued_time": "2024-10-31T04:24:01.6603792Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 29, - "statement_ids": [ - 29 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 29, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----+----------+----------------------+\n", - "|label|technology|transformed_technology|\n", - "+-----+----------+----------------------+\n", - "| 0| docker| DOCKER|\n", - "| 1| spark| SPARK|\n", - "| 2| python| PYTHON|\n", - "+-----+----------+----------------------+\n", - "\n" - ] - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import udf\n", - "from pyspark.sql.types import StringType\n", - "\n", - "# Initialize Spark session\n", - "spark = SparkSession.builder.appName(\"example\").getOrCreate()\n", - "\n", - "# Construct a DataFrame with technology names\n", - "df = spark.createDataFrame(\n", - " [\n", - " (0, \"docker\"), (1, \"spark\"), (2, \"python\")\n", - " ],\n", - " [\"label\", \"technology\"]\n", - ")\n", - "\n", - "# Define a simple UDF to transform the technology column\n", - "def transform_technology(tech):\n", - " return tech.upper()\n", - "\n", - "# Register the UDF\n", - "transform_udf = udf(transform_technology, StringType())\n", - "\n", - "# Apply the UDF to the DataFrame\n", - "transformed_df = df.withColumn(\"transformed_technology\", transform_udf(df[\"technology\"]))\n", - "\n", - "# Show the transformed DataFrame\n", - "transformed_df.show()" - ] - }, - { - "cell_type": "markdown", - "id": "47ab1ba6-deaf-488d-9e95-8202669d948c", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Extract content from PDFs linked in arXiv papers and generate prompts for extracting specific information.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "8b52c87e-5971-4d28-bc4b-4160d29a1c24", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:27:08.3224773Z", - "execution_start_time": "2024-10-31T04:27:08.0430507Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "4eeab690-4159-41dc-be69-3cceed484314", - "queued_time": "2024-10-31T04:27:07.6309068Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 30, - "statement_ids": [ - 30 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 30, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.document_loaders import OnlinePDFLoader\n", - "\n", - "def paper_content_extraction(inputs: dict) -> dict:\n", - " arxiv_link = inputs[\"arxiv_link\"]\n", - " loader = OnlinePDFLoader(arxiv_link)\n", - " pages = loader.load_and_split()\n", - " return {\"paper_content\": pages[0].page_content + pages[1].page_content}\n", - "\n", - "def prompt_generation(inputs: dict) -> dict:\n", - " output = inputs[\"Output\"]\n", - " prompt = (\n", - " \"find the paper title, author, summary in the paper description below, output them. \"\n", - " \"After that, Use websearch to find out 3 recent papers of the first author in the author section below \"\n", - " \"(first author is the first name separated by comma) and list the paper titles in bullet points: \"\n", - " \"\\n\" + output + \".\"\n", - " )\n", - " return {\"prompt\": prompt}" - ] - }, - { - "cell_type": "markdown", - "id": "89d79c38-ba0c-4062-911c-7ede02536298", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Set up a chain to extract structured information from an arXiv link\n" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "e85241a0-11c2-49c1-9b2e-63187cb24d9a", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:28:11.2331925Z", - "execution_start_time": "2024-10-31T04:28:11.0134852Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "232b4aa0-1b84-47f8-bb5d-347a575d9640", - "queued_time": "2024-10-31T04:28:10.663514Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 31, - "statement_ids": [ - 31 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 31, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.chains import TransformChain, SimpleSequentialChain\n", - "\n", - "paper_content_extraction_chain = TransformChain(\n", - " input_variables=[\"arxiv_link\"],\n", - " output_variables=[\"paper_content\"],\n", - " transform=paper_content_extraction,\n", - " verbose=False,\n", - ")\n", - "\n", - "paper_summarizer_template = \"\"\"\n", - "You are a paper summarizer, given the paper content, it is your job to summarize the paper into a short summary, \n", - "and extract authors and paper title from the paper content.\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "64937339-791c-4aad-953b-ca990bfd324a", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Use Microsoft Fabric's native integration with the MLflow framework to log the trained machine learning models, the used hyperparameters, and evaluation metrics." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5bac7684-a123-4733-baa3-a748ff0fd070", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:36:54.8917645Z", - "execution_start_time": "2024-10-31T04:36:44.7561664Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "d2abef17-25d7-41c4-a62f-051d9b5fe8d7", - "queued_time": "2024-10-31T04:36:44.2999954Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 33, - "statement_ids": [ - 33 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 33, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registered model 'RandomForestRegressionModel' already exists. Creating a new version of this model...\n", - "2024/10/31 04:36:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestRegressionModel, version 2\n", - "Created version '2' of model 'RandomForestRegressionModel'.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Name: RandomForestRegressionModel\n", - "Model Version: 2\n" - ] - }, - { - "data": { - "application/vnd.mlflow.run-widget+json": { - "data": { - "metrics": {}, - "params": { - "n_estimators": "3", - "random_state": "42" - }, - "tags": { - "mlflow.rootRunId": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", - "mlflow.runName": "icy_hamster_xr34qfzf", - "mlflow.user": "4b3a56ea-6f42-450e-b7c3-fb2932c7ac32", - "synapseml.experiment.artifactId": "17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b", - "synapseml.experimentName": "Notebook-1", - "synapseml.livy.id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "synapseml.notebook.artifactId": "789d5fef-b2a1-409b-996f-0cdb4e748a90", - "synapseml.user.id": "ea5a1fdc-a08c-493a-bce9-8422f28ecd05", - "synapseml.user.name": "System Administrator" - } - }, - "info": { - "artifact_uri": "sds://onelakewestus3.pbidedicated.windows.net/6361aeaa-b63a-44ea-b28f-26db10b31a6c/17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b/20c75f63-d266-40b1-83f7-d9c76fd1f4f4/artifacts", - "end_time": 1730349412, - "experiment_id": "d52403ad-a9c2-41ba-b582-9b8e9a57917e", - "lifecycle_stage": "active", - "run_id": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", - "run_name": "", - "run_uuid": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", - "start_time": 1730349405, - "status": "FINISHED", - "user_id": "7ebfac85-3ebb-440f-a743-e52052051f6a" - }, - "inputs": { - "dataset_inputs": [] - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import mlflow\n", - "from mlflow.models import infer_signature\n", - "from sklearn.datasets import make_regression\n", - "from sklearn.ensemble import RandomForestRegressor\n", - "\n", - "# Generate synthetic regression data\n", - "X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)\n", - "\n", - "# Model parameters\n", - "params = {\"n_estimators\": 3, \"random_state\": 42}\n", - "\n", - "# Model tags for MLflow\n", - "model_tags = {\n", - " \"project_name\": \"grocery-forecasting\",\n", - " \"store_dept\": \"produce\",\n", - " \"team\": \"stores-ml\",\n", - " \"project_quarter\": \"Q3-2023\"\n", - "}\n", - "\n", - "# Log MLflow entities\n", - "with mlflow.start_run() as run:\n", - " # Train the model\n", - " model = RandomForestRegressor(**params).fit(X, y)\n", - "\n", - " # Infer the model signature\n", - " signature = infer_signature(X, model.predict(X))\n", - "\n", - " # Log parameters and the model\n", - " mlflow.log_params(params)\n", - " mlflow.sklearn.log_model(model, artifact_path=\"sklearn-model\", signature=signature)\n", - "\n", - " # Register the model with tags\n", - " model_uri = f\"runs:/{run.info.run_id}/sklearn-model\"\n", - " model_version = mlflow.register_model(model_uri, \"RandomForestRegressionModel\", tags=model_tags)\n", - "\n", - " # Output model registration details\n", - " print(f\"Model Name: {model_version.name}\")\n", - " print(f\"Model Version: {model_version.version}\")" - ] - }, - { - "cell_type": "markdown", - "id": "315ebdcd-e78c-4bc5-93d6-f202d02bddc5", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Use MLflow to search among multiple models saved within the workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60e6f7d3-d1ec-4ccc-9745-6c7938d2f4bc", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "from mlflow.tracking import MlflowClient\n", - "\n", - "client = MlflowClient()\n", - "for rm in client.search_registered_models():\n", - " pprint(dict(rm), indent=4)" - ] - } - ], - "metadata": { - "application/vnd.jupyter.widget-state+json": { - "version": "1.0" - }, - "dependencies": { - "environment": { - "environmentId": "766562be-9e21-456c-b270-cac7e4bf8d18", - "workspaceId": "6361aeaa-b63a-44ea-b28f-26db10b31a6c" - } - }, - "kernel_info": { - "name": "synapse_pyspark" - }, - "kernelspec": { - "display_name": "Synapse PySpark", - "language": "Python", - "name": "synapse_pyspark" - }, - "language_info": { - "name": "python" - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark", - "ms_spell_check": { - "ms_spell_check_language": "en" - } - }, - "nteract": { - "version": "nteract-front-end@1.0.0" - }, - "spark_compute": { - "compute_id": "/trident/default", - "session_options": { - "conf": { - "spark.synapse.nbs.session.timeout": "1200000" - } - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version": "1.0" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} +{ + "cells": [ + { + "cell_type": "markdown", + "id": "519955e9-2dad-456d-93db-a332d38e9433", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# Fabric: Highlights into AI/LLMs" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d312e8d9-03fe-4b3d-aa6d-c52e3022ae39", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T03:58:26.7170509Z", + "execution_start_time": "2024-10-31T03:58:19.270951Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "e267b6ab-5133-4598-8251-d64374cd11e5", + "queued_time": "2024-10-31T03:58:18.9132075Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 5, + "statement_ids": [ + 5 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 5, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: synapseml\r\n", + "Version: 1.0.8\r\n", + "Summary: Synapse Machine Learning\r\n", + "Home-page: https://github.com/Microsoft/SynapseML\r\n", + "Author: Microsoft\r\n", + "Author-email: synapseml-support@microsoft.com\r\n", + "License: MIT\r\n", + "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", + "Requires: \r\n", + "Required-by: \r\n" + ] + } + ], + "source": [ + "!pip show synapseml" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "427610d0-3fae-45e3-8150-92ee7674f44c", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T03:58:28.6254349Z", + "execution_start_time": "2024-10-31T03:58:27.1124616Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "0e9f6c0f-062b-4e5d-9061-afcd89c8fd75", + "queued_time": "2024-10-31T03:58:19.3223486Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 6, + "statement_ids": [ + 6 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 6, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: langchain-openai\r\n", + "Version: 0.2.4\r\n", + "Summary: An integration package connecting OpenAI and LangChain\r\n", + "Home-page: https://github.com/langchain-ai/langchain\r\n", + "Author: \r\n", + "Author-email: \r\n", + "License: MIT\r\n", + "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", + "Requires: langchain-core, openai, tiktoken\r\n", + "Required-by: \r\n" + ] + } + ], + "source": [ + "!pip show langchain-openai" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "baeeb853-2104-4edf-abf4-4d4be50cb977", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T03:58:30.5465258Z", + "execution_start_time": "2024-10-31T03:58:29.0000586Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "716d9975-263b-4d92-b25c-b342106f5f43", + "queued_time": "2024-10-31T03:58:19.511824Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 7, + "statement_ids": [ + 7 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 7, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: langchain\r\n", + "Version: 0.3.6\r\n", + "Summary: Building applications with LLMs through composability\r\n", + "Home-page: https://github.com/langchain-ai/langchain\r\n", + "Author: \r\n", + "Author-email: \r\n", + "License: MIT\r\n", + "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", + "Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity\r\n", + "Required-by: langchain-community\r\n" + ] + } + ], + "source": [ + "!pip show langchain" + ] + }, + { + "cell_type": "markdown", + "id": "c58cc406-c4f5-4607-a740-0802e8e4b550", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Ensure you have the API key and endpoint URL for your deployed model. Set these as environment variables" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3c8ada7c-2632-4c69-86d2-f5260ee8f1b7", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:14.3495341Z", + "execution_start_time": "2024-10-31T04:20:14.1128215Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "2573bf75-fe6d-40dc-b9f6-e06ebb9f7f73", + "queued_time": "2024-10-31T04:20:13.6194485Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 22, + "statement_ids": [ + 22 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 22, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_VERSION\"] = \"2023-08-01-preview\"\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"https://your-resource.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview\"\n", + "os.environ[\"AZURE_OPENAI_API_KEY\"] = \"your-value\"" + ] + }, + { + "cell_type": "markdown", + "id": "3fac48a9-45fb-4e86-9792-8ee340b0ac60", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create an instance of the Azure OpenAI class using the environment variables set above" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5db10350-8000-4cbd-9bdf-d7da62d7fe61", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:14.9382032Z", + "execution_start_time": "2024-10-31T04:20:14.7083469Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "7dfaca5a-f738-4010-bba1-f764ea70f450", + "queued_time": "2024-10-31T04:20:14.027325Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 23, + "statement_ids": [ + 23 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 23, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain_openai import AzureChatOpenAI\n", + "\n", + "# Set the API base URL\n", + "api_base = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n", + "\n", + "# Create an instance of the Azure OpenAI Class\n", + "llm = AzureChatOpenAI(\n", + " openai_api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", + " temperature=0.7,\n", + " verbose=True,\n", + " top_p=0.9\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b17d7450-34b5-4ece-8e20-a77ddcdd93c4", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Use the Azure OpenAI service to generate text or perform other language model tasks" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "cfc5fd62-085a-4eff-9192-696d9f249a8e", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:16.0500538Z", + "execution_start_time": "2024-10-31T04:20:15.2936074Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "e14e4d0b-1fd0-4dac-a07d-6479d6536ce3", + "queued_time": "2024-10-31T04:20:14.4969185Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 24, + "statement_ids": [ + 24 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 24, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "content='Salut, comment \u00e7a va ?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 33, 'total_tokens': 39, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_d54531d9eb', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}} id='run-8cb7f29a-44c1-4f65-a648-15afb2d793dc-0' usage_metadata={'input_tokens': 33, 'output_tokens': 6, 'total_tokens': 39, 'input_token_details': {}, 'output_token_details': {}}\n" + ] + } + ], + "source": [ + "# Define a prompt\n", + "messages = [\n", + " (\n", + " \"system\",\n", + " \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n", + " ),\n", + " (\"human\", \"Hi, how are you?\"),\n", + "]\n", + "\n", + "# Generate a response from the Azure OpenAI service using the invoke method\n", + "ai_msg = llm.invoke(messages)\n", + "\n", + "# Print the response\n", + "print(ai_msg)" + ] + }, + { + "cell_type": "markdown", + "id": "79729106-c7f1-4879-bc2b-871b50c2ac9a", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Define a prompt template for generating definitions" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ca633361-c27b-4294-b8a7-9fc4a316afa4", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:16.587491Z", + "execution_start_time": "2024-10-31T04:20:16.3655978Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "cc3215f4-71a5-4231-af47-9bd9a8f5698a", + "queued_time": "2024-10-31T04:20:14.7799392Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 25, + "statement_ids": [ + 25 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 25, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "\n", + "copy_prompt = PromptTemplate(\n", + " input_variables=[\"technology\"],\n", + " template=\"Define the following word: {technology}\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "899839d9-adca-4042-b662-73edcad7e432", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create an LLMChain with the defined prompt template" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bd4f65ca-049b-481d-bbbd-a017c6c0119b", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:17.1233668Z", + "execution_start_time": "2024-10-31T04:20:16.9052959Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "27790d83-509f-4716-bb69-9c288ad069ba", + "queued_time": "2024-10-31T04:20:15.1325692Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 26, + "statement_ids": [ + 26 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 26, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.chains import LLMChain\n", + "\n", + "chain = LLMChain(llm=llm, prompt=copy_prompt)\n" + ] + }, + { + "cell_type": "markdown", + "id": "936b3ddf-cc65-436c-ba4e-ae0abe21fc2c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Set up the LangChain transformer to execute the processing chain\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "63a00038-37b4-49ee-9c53-128c8acf9d01", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:18.181457Z", + "execution_start_time": "2024-10-31T04:20:17.4351576Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "3fb30420-f0c9-477b-ad1a-001dc0d8d37a", + "queued_time": "2024-10-31T04:20:15.6799013Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 27, + "statement_ids": [ + 27 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 27, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from synapse.ml.cognitive.langchain import LangchainTransformer\n", + "\n", + "openai_api_key= os.environ[\"AZURE_OPENAI_API_KEY\"]\n", + "\n", + "transformer = (\n", + " LangchainTransformer()\n", + " .setInputCol(\"technology\")\n", + " .setOutputCol(\"definition\")\n", + " .setChain(chain)\n", + " .setSubscriptionKey(openai_api_key)\n", + " .setUrl(api_base)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c74293f0-925e-4987-a6a1-b3b9b8e14b9d", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Construct a DataFrame with technology names." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8e03963e-2fcf-4934-b96f-ac27b4e0353c", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:24:08.3891172Z", + "execution_start_time": "2024-10-31T04:24:02.0675933Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "856f5b73-26e8-4d20-a901-356cd92b9c2a", + "queued_time": "2024-10-31T04:24:01.6603792Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 29, + "statement_ids": [ + 29 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 29, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+----------+----------------------+\n", + "|label|technology|transformed_technology|\n", + "+-----+----------+----------------------+\n", + "| 0| docker| DOCKER|\n", + "| 1| spark| SPARK|\n", + "| 2| python| PYTHON|\n", + "+-----+----------+----------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "\n", + "# Initialize Spark session\n", + "spark = SparkSession.builder.appName(\"example\").getOrCreate()\n", + "\n", + "# Construct a DataFrame with technology names\n", + "df = spark.createDataFrame(\n", + " [\n", + " (0, \"docker\"), (1, \"spark\"), (2, \"python\")\n", + " ],\n", + " [\"label\", \"technology\"]\n", + ")\n", + "\n", + "# Define a simple UDF to transform the technology column\n", + "def transform_technology(tech):\n", + " return tech.upper()\n", + "\n", + "# Register the UDF\n", + "transform_udf = udf(transform_technology, StringType())\n", + "\n", + "# Apply the UDF to the DataFrame\n", + "transformed_df = df.withColumn(\"transformed_technology\", transform_udf(df[\"technology\"]))\n", + "\n", + "# Show the transformed DataFrame\n", + "transformed_df.show()" + ] + }, + { + "cell_type": "markdown", + "id": "47ab1ba6-deaf-488d-9e95-8202669d948c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Extract content from PDFs linked in arXiv papers and generate prompts for extracting specific information.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8b52c87e-5971-4d28-bc4b-4160d29a1c24", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:27:08.3224773Z", + "execution_start_time": "2024-10-31T04:27:08.0430507Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "4eeab690-4159-41dc-be69-3cceed484314", + "queued_time": "2024-10-31T04:27:07.6309068Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 30, + "statement_ids": [ + 30 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 30, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.document_loaders import OnlinePDFLoader\n", + "\n", + "def paper_content_extraction(inputs: dict) -> dict:\n", + " arxiv_link = inputs[\"arxiv_link\"]\n", + " loader = OnlinePDFLoader(arxiv_link)\n", + " pages = loader.load_and_split()\n", + " return {\"paper_content\": pages[0].page_content + pages[1].page_content}\n", + "\n", + "def prompt_generation(inputs: dict) -> dict:\n", + " output = inputs[\"Output\"]\n", + " prompt = (\n", + " \"find the paper title, author, summary in the paper description below, output them. \"\n", + " \"After that, Use websearch to find out 3 recent papers of the first author in the author section below \"\n", + " \"(first author is the first name separated by comma) and list the paper titles in bullet points: \"\n", + " \"\\n\" + output + \".\"\n", + " )\n", + " return {\"prompt\": prompt}" + ] + }, + { + "cell_type": "markdown", + "id": "89d79c38-ba0c-4062-911c-7ede02536298", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Set up a chain to extract structured information from an arXiv link\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e85241a0-11c2-49c1-9b2e-63187cb24d9a", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:28:11.2331925Z", + "execution_start_time": "2024-10-31T04:28:11.0134852Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "232b4aa0-1b84-47f8-bb5d-347a575d9640", + "queued_time": "2024-10-31T04:28:10.663514Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 31, + "statement_ids": [ + 31 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 31, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.chains import TransformChain, SimpleSequentialChain\n", + "\n", + "paper_content_extraction_chain = TransformChain(\n", + " input_variables=[\"arxiv_link\"],\n", + " output_variables=[\"paper_content\"],\n", + " transform=paper_content_extraction,\n", + " verbose=False,\n", + ")\n", + "\n", + "paper_summarizer_template = \"\"\"\n", + "You are a paper summarizer, given the paper content, it is your job to summarize the paper into a short summary, \n", + "and extract authors and paper title from the paper content.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "64937339-791c-4aad-953b-ca990bfd324a", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Use Microsoft Fabric's native integration with the MLflow framework to log the trained machine learning models, the used hyperparameters, and evaluation metrics." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5bac7684-a123-4733-baa3-a748ff0fd070", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:36:54.8917645Z", + "execution_start_time": "2024-10-31T04:36:44.7561664Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "d2abef17-25d7-41c4-a62f-051d9b5fe8d7", + "queued_time": "2024-10-31T04:36:44.2999954Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 33, + "statement_ids": [ + 33 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 33, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'RandomForestRegressionModel' already exists. Creating a new version of this model...\n", + "2024/10/31 04:36:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestRegressionModel, version 2\n", + "Created version '2' of model 'RandomForestRegressionModel'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Name: RandomForestRegressionModel\n", + "Model Version: 2\n" + ] + }, + { + "data": { + "application/vnd.mlflow.run-widget+json": { + "data": { + "metrics": {}, + "params": { + "n_estimators": "3", + "random_state": "42" + }, + "tags": { + "mlflow.rootRunId": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", + "mlflow.runName": "icy_hamster_xr34qfzf", + "mlflow.user": "4b3a56ea-6f42-450e-b7c3-fb2932c7ac32", + "synapseml.experiment.artifactId": "17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b", + "synapseml.experimentName": "Notebook-1", + "synapseml.livy.id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "synapseml.notebook.artifactId": "789d5fef-b2a1-409b-996f-0cdb4e748a90", + "synapseml.user.id": "ea5a1fdc-a08c-493a-bce9-8422f28ecd05", + "synapseml.user.name": "System Administrator" + } + }, + "info": { + "artifact_uri": "sds://onelakewestus3.pbidedicated.windows.net/6361aeaa-b63a-44ea-b28f-26db10b31a6c/17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b/20c75f63-d266-40b1-83f7-d9c76fd1f4f4/artifacts", + "end_time": 1730349412, + "experiment_id": "d52403ad-a9c2-41ba-b582-9b8e9a57917e", + "lifecycle_stage": "active", + "run_id": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", + "run_name": "", + "run_uuid": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", + "start_time": 1730349405, + "status": "FINISHED", + "user_id": "7ebfac85-3ebb-440f-a743-e52052051f6a" + }, + "inputs": { + "dataset_inputs": [] + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "# Generate synthetic regression data\n", + "X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)\n", + "\n", + "# Model parameters\n", + "params = {\"n_estimators\": 3, \"random_state\": 42}\n", + "\n", + "# Model tags for MLflow\n", + "model_tags = {\n", + " \"project_name\": \"grocery-forecasting\",\n", + " \"store_dept\": \"produce\",\n", + " \"team\": \"stores-ml\",\n", + " \"project_quarter\": \"Q3-2023\"\n", + "}\n", + "\n", + "# Log MLflow entities\n", + "with mlflow.start_run() as run:\n", + " # Train the model\n", + " model = RandomForestRegressor(**params).fit(X, y)\n", + "\n", + " # Infer the model signature\n", + " signature = infer_signature(X, model.predict(X))\n", + "\n", + " # Log parameters and the model\n", + " mlflow.log_params(params)\n", + " mlflow.sklearn.log_model(model, artifact_path=\"sklearn-model\", signature=signature)\n", + "\n", + " # Register the model with tags\n", + " model_uri = f\"runs:/{run.info.run_id}/sklearn-model\"\n", + " model_version = mlflow.register_model(model_uri, \"RandomForestRegressionModel\", tags=model_tags)\n", + "\n", + " # Output model registration details\n", + " print(f\"Model Name: {model_version.name}\")\n", + " print(f\"Model Version: {model_version.version}\")" + ] + }, + { + "cell_type": "markdown", + "id": "315ebdcd-e78c-4bc5-93d6-f202d02bddc5", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Use MLflow to search among multiple models saved within the workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60e6f7d3-d1ec-4ccc-9745-6c7938d2f4bc", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "from mlflow.tracking import MlflowClient\n", + "\n", + "client = MlflowClient()\n", + "for rm in client.search_registered_models():\n", + " pprint(dict(rm), indent=4)" + ] + } + ], + "metadata": { + "application/vnd.jupyter.widget-state+json": { + "version": "1.0" + }, + "dependencies": { + "environment": { + "environmentId": "766562be-9e21-456c-b270-cac7e4bf8d18", + "workspaceId": "6361aeaa-b63a-44ea-b28f-26db10b31a6c" + } + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index a2ac1f2..aaecd13 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ---------- @@ -99,7 +99,7 @@ Key Features:
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index 180585a..058bccc 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -196,7 +196,7 @@ Steps to Set Up Incremental Refresh:
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index 7abe2d4..435e6e0 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -14,7 +14,7 @@ Last updated: 2025-07-16
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index 9c754d5..f6f3151 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -6,7 +6,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ---------- @@ -64,7 +64,7 @@ https://github.com/user-attachments/assets/64f099a1-b749-47a6-b723-fa1cb5c575a3
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/README.md b/README.md index cf4c702..f72cdce 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -24,7 +24,6 @@ Last updated: 2025-07-16 - For this workshop, you can set up your infrastructure using either of the following approaches: 1. [Infrastructure via Azure Portal](./AzurePortal/): This approach involves creating the infrastructure and performing `all necessary steps through the Azure Portal` and its resources interface. @@ -72,7 +71,7 @@ This is an introductory workshop on Microsoft Fabric. Please follow as described
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/Terraform/README.md b/Terraform/README.md index a31bab3..3dff5c2 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -143,7 +143,7 @@ https://github.com/user-attachments/assets/1ab31707-6f4c-4ec7-9e92-5d5cc96ac5bb
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index 687daea..2d4994c 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ @@ -176,7 +176,7 @@ Error: Failed to get existing workspaces: Error retrieving keys for Storage Acco
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11