From eab561a3eea26cfff5554e2f73f8dd0abbe1357b Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 10:53:35 -0600 Subject: [PATCH 01/22] Add workflow to format Jupyter notebooks for GitHub This workflow formats Jupyter notebooks for GitHub compatibility by converting VS Code XML format to standard JSON, adding missing widget state metadata, and ensuring compliance with GitHub's requirements. --- .github/workflows/render-notebooks.yml | 184 +++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 .github/workflows/render-notebooks.yml diff --git a/.github/workflows/render-notebooks.yml b/.github/workflows/render-notebooks.yml new file mode 100644 index 0000000..e20cec2 --- /dev/null +++ b/.github/workflows/render-notebooks.yml @@ -0,0 +1,184 @@ +name: Format Notebook for GitHub + +on: + push: + paths: + - '**.ipynb' + pull_request: + branches: + - main + workflow_dispatch: # Allows manual triggering + +permissions: + contents: write + pull-requests: write + +jobs: + format-notebooks: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install nbformat nbconvert jupyter + + - name: Check and fix notebook format + run: | + # Create a script to fix notebooks + cat > fix_notebooks.py << 'EOF' + import os + import json + import nbformat + from nbformat.validator import validate + + # Find all notebook files + notebook_files = [] + for root, dirs, files in os.walk('.'): + if '.git' in dirs: + dirs.remove('.git') + for file in files: + if file.endswith('.ipynb'): + notebook_files.append(os.path.join(root, file)) + + print(f"Found {len(notebook_files)} notebooks to process") + + # Process each notebook + for nb_path in notebook_files: + print(f"Processing {nb_path}") + try: + # Read the notebook + with open(nb_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Handle VS Code XML format + if '(.*?)', re.DOTALL) + for match in cell_pattern.finditer(content): + cell_type, cell_content = match.groups() + + if cell_type == "markdown": + cells.append(nbformat.v4.new_markdown_cell(cell_content.strip())) + elif cell_type in ["python", "javascript", "java", "typescript"]: + cells.append(nbformat.v4.new_code_cell(cell_content.strip())) + + # Create a new notebook + nb = nbformat.v4.new_notebook() + nb.cells = cells + + # Add metadata + nb.metadata = { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + } + else: + # Standard JSON format notebook + nb = nbformat.reads(content, as_version=4) + + # Ensure widget state exists if needed + if "widgets" in str(content): + print(f" Adding widget state metadata...") + if "metadata" not in nb: + nb["metadata"] = {} + if "widgets" not in nb["metadata"]: + nb.metadata["widgets"] = { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + + # Validate the notebook + validate(nb) + + # Write the fixed notebook + with open(nb_path, 'w', encoding='utf-8') as f: + nbformat.write(nb, f) + + print(f" Successfully processed {nb_path}") + + except Exception as e: + print(f" Error processing {nb_path}: {str(e)}") + continue + EOF + + # Run the notebook fixing script + python fix_notebooks.py + + - name: Configure Git + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Commit changes (if any) + run: | + git add "*.ipynb" + git commit -m "Fix notebook format for GitHub compatibility" || echo "No changes to commit" + + - name: Push changes (PR) + if: github.event_name == 'pull_request' + env: + TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git fetch origin + git checkout -b ${{ github.event.pull_request.head.ref }} origin/${{ github.event.pull_request.head.ref }} + git pull --rebase origin ${{ github.event.pull_request.head.ref }} || echo "No rebase needed" + git push origin HEAD:${{ github.event.pull_request.head.ref }} + + - name: Push changes (non-PR) + if: github.event_name != 'pull_request' + env: + TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + git remote set-url origin https://x-access-token:${TOKEN}@github.com/${{ github.repository }} + git push || echo "No changes to push" + + - name: Create Pull Request (non-PR) + if: github.event_name != 'pull_request' + uses: peter-evans/create-pull-request@v6 + with: + token: ${{ secrets.GITHUB_TOKEN }} + branch: fix-notebook-format + title: "Fix notebook format for GitHub compatibility" + body: | + This PR fixes Jupyter notebook formatting issues to ensure proper rendering on GitHub. + + The workflow addresses: + - Converting VS Code XML format to standard Jupyter JSON format + - Adding missing widget state metadata + - Ensuring notebook format complies with GitHub's requirements + + These changes allow notebooks to render properly in GitHub's notebook viewer. + base: main From e621ca01595eea6fa3ca78b9b1c5d326fb7a67f3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 16:53:57 +0000 Subject: [PATCH 02/22] Update visitor count --- 0_Overview.md | 4 ++-- AzurePortal/1_MedallionArch/README.md | 4 ++-- AzurePortal/1_MedallionArch/docs/README.md | 4 ++-- AzurePortal/2_AI_LLMs/README.md | 4 ++-- AzurePortal/3_AISkills.md | 4 ++-- AzurePortal/4_CICD/0_deployment-pipelines/README.md | 4 ++-- AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md | 4 ++-- AzurePortal/4_CICD/1_github-integration.md | 4 ++-- README.md | 4 ++-- Terraform/README.md | 4 ++-- Terraform/troubleshooting.md | 4 ++-- 11 files changed, 22 insertions(+), 22 deletions(-) diff --git a/0_Overview.md b/0_Overview.md index abb8c66..af60cc1 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -104,7 +104,7 @@ Click here for more information about: [Z-Order & V-Order](https://github.com/Mi
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index a7b2f92..d3f50df 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -299,7 +299,7 @@ https://github.com/user-attachments/assets/2a64762a-f120-4448-b0fb-7a49f4d1bedb
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index eef1221..16d60f3 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -15,7 +15,7 @@ Last updated: 2025-07-16
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 8967712..078d3ea 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -397,7 +397,7 @@ Make sure to replace `"your_openai_api_key"`, `"https://your_openai_api_base/"`,
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index a2ac1f2..7db38f7 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -99,7 +99,7 @@ Key Features:
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index 180585a..74c2dd1 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -196,7 +196,7 @@ Steps to Set Up Incremental Refresh:
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index 7abe2d4..b11f1b6 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -14,7 +14,7 @@ Last updated: 2025-07-16
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index 9c754d5..40f5eb6 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -64,7 +64,7 @@ https://github.com/user-attachments/assets/64f099a1-b749-47a6-b723-fa1cb5c575a3
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/README.md b/README.md index cf4c702..d07db18 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ This is an introductory workshop on Microsoft Fabric. Please follow as described
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/Terraform/README.md b/Terraform/README.md index a31bab3..d0eeaad 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -143,7 +143,7 @@ https://github.com/user-attachments/assets/1ab31707-6f4c-4ec7-9e92-5d5cc96ac5bb
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index 687daea..be9c388 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -176,7 +176,7 @@ Error: Failed to get existing workspaces: Error retrieving keys for Storage Acco
- Total views -

Refresh Date: 2025-07-16

+ Total views +

Refresh Date: 2025-09-11

From d7256548e492829b292df65ddf42f0c807b11c99 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:02:15 -0600 Subject: [PATCH 03/22] Create render_notebooks.py --- .github/workflows/render_notebooks.py | 203 ++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) create mode 100644 .github/workflows/render_notebooks.py diff --git a/.github/workflows/render_notebooks.py b/.github/workflows/render_notebooks.py new file mode 100644 index 0000000..ed674cd --- /dev/null +++ b/.github/workflows/render_notebooks.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +Notebook to GitHub-Compatible Format Converter + +This script renders XML-format notebooks to standard Jupyter JSON format +with the required widget state metadata for GitHub rendering. +""" + +import os +import json +import re +import nbformat +from nbformat.validator import validate + +def process_notebooks(directory="."): + """Find and process all notebook files in the repository""" + notebook_files = [] + for root, dirs, files in os.walk(directory): + if '.git' in dirs: + dirs.remove('.git') # Skip git directory + if '.github' in dirs: + dirs.remove('.github') # Skip GitHub directory + for file in files: + if file.endswith('.ipynb'): + notebook_files.append(os.path.join(root, file)) + + print(f"Found {len(notebook_files)} notebooks to process") + + success_count = 0 + for nb_path in notebook_files: + if convert_notebook(nb_path): + success_count += 1 + + print(f"Successfully rendered {success_count} out of {len(notebook_files)} notebooks") + return success_count + +def convert_notebook(filepath): + """Convert a XML notebook to standard Jupyter JSON format""" + print(f"\nProcessing {filepath}") + + try: + # Read the notebook content + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + + # Check if this is a XML notebook + if '(.*?)', re.DOTALL) + + for match in cell_pattern.finditer(content): + cell_type, cell_content = match.groups() + + if cell_type == "markdown": + cells.append(nbformat.v4.new_markdown_cell( + source=cell_content.strip() + )) + else: # python, javascript, etc. + cells.append(nbformat.v4.new_code_cell( + source=cell_content.strip() + )) + + # Create a new notebook + nb = nbformat.v4.new_notebook() + nb.cells = cells + + # Add required metadata + nb.metadata = { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + } + + # Validate and write the notebook + validate(nb) + with open(filepath, 'w', encoding='utf-8') as f: + nbformat.write(nb, f) + + print(f" Successfully rendered {filepath} for GitHub compatibility") + return True + + else: + # It's already in JSON format, check if it has widget state + try: + nb_dict = json.loads(content) + + # Check if we need to add widget state metadata + if "widgets" not in nb_dict.get("metadata", {}): + print(f" Adding widget state metadata to JSON notebook...") + nb = nbformat.reads(content, as_version=4) + if "metadata" not in nb: + nb.metadata = {} + nb.metadata["widgets"] = { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + + # Validate and write the notebook + validate(nb) + with open(filepath, 'w', encoding='utf-8') as f: + nbformat.write(nb, f) + + print(f" Successfully added widget state to {filepath}") + return True + else: + print(f" Notebook already in correct format for GitHub, no changes needed") + return True + except json.JSONDecodeError: + print(f" ERROR: {filepath} is not in valid JSON format or XML format") + return False + + except Exception as e: + print(f" ERROR processing {filepath}: {str(e)}") + return False + +def verify_notebooks(directory="."): + """Check all notebooks are in valid Jupyter format for GitHub""" + notebook_files = [] + for root, dirs, files in os.walk(directory): + if '.git' in dirs: + dirs.remove('.git') + if '.github' in dirs: + dirs.remove('.github') + for file in files: + if file.endswith('.ipynb'): + notebook_files.append(os.path.join(root, file)) + + print(f"\nVerifying {len(notebook_files)} notebooks for GitHub compatibility") + + errors = 0 + for nb_path in notebook_files: + print(f"Checking {nb_path}") + try: + with open(nb_path, 'r', encoding='utf-8') as f: + content = f.read() + + # Check if it's still in XML format + if ' 0: + print(f"\n{errors} notebooks may have issues with GitHub rendering") + else: + print("\nAll notebooks are properly formatted for GitHub rendering") + + return errors + +if __name__ == "__main__": + print("Rendering notebooks for GitHub compatibility...") + process_notebooks() + verify_notebooks() From 409444f5f2e42a1e2d08f54ceb0671c66beb5a1d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 17:02:26 +0000 Subject: [PATCH 04/22] Update last modified date in Markdown files --- 0_Overview.md | 2 +- AzurePortal/1_MedallionArch/README.md | 2 +- AzurePortal/1_MedallionArch/docs/README.md | 2 +- AzurePortal/2_AI_LLMs/README.md | 2 +- AzurePortal/3_AISkills.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/README.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md | 2 +- AzurePortal/4_CICD/1_github-integration.md | 2 +- README.md | 2 +- Terraform/README.md | 2 +- Terraform/troubleshooting.md | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/0_Overview.md b/0_Overview.md index af60cc1..004364c 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index d3f50df..38cfdd5 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index 16d60f3..0b1d77a 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 078d3ea..66f8921 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index 7db38f7..b52cc6b 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ---------- diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index 74c2dd1..c98acca 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index b11f1b6..1f2fe3c 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index 40f5eb6..6ec3d0f 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -6,7 +6,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ---------- diff --git a/README.md b/README.md index d07db18..e163bf2 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/Terraform/README.md b/Terraform/README.md index d0eeaad..a513e3a 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index be9c388..cfdedf9 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-07-16 +Last updated: 2025-09-11 ------------------------------------------ From 197cde275e7dab6b240ff532cc3415fb1965dd60 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:02:56 -0600 Subject: [PATCH 05/22] Render notebooks for GitHub compatibility Updated workflow to render Jupyter notebooks for GitHub compatibility, including handling VS Code format and adding widget state metadata. --- .github/workflows/render-notebooks.yml | 172 ++++--------------------- 1 file changed, 24 insertions(+), 148 deletions(-) diff --git a/.github/workflows/render-notebooks.yml b/.github/workflows/render-notebooks.yml index e20cec2..195daf8 100644 --- a/.github/workflows/render-notebooks.yml +++ b/.github/workflows/render-notebooks.yml @@ -1,12 +1,12 @@ -name: Format Notebook for GitHub +name: Render for GitHub on: - push: - paths: - - '**.ipynb' pull_request: branches: - main + push: + paths: + - '**.ipynb' workflow_dispatch: # Allows manual triggering permissions: @@ -14,7 +14,7 @@ permissions: pull-requests: write jobs: - format-notebooks: + render-notebooks: runs-on: ubuntu-latest steps: @@ -26,159 +26,35 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.x' - name: Install dependencies run: | python -m pip install --upgrade pip pip install nbformat nbconvert jupyter - - name: Check and fix notebook format - run: | - # Create a script to fix notebooks - cat > fix_notebooks.py << 'EOF' - import os - import json - import nbformat - from nbformat.validator import validate - - # Find all notebook files - notebook_files = [] - for root, dirs, files in os.walk('.'): - if '.git' in dirs: - dirs.remove('.git') - for file in files: - if file.endswith('.ipynb'): - notebook_files.append(os.path.join(root, file)) - - print(f"Found {len(notebook_files)} notebooks to process") - - # Process each notebook - for nb_path in notebook_files: - print(f"Processing {nb_path}") - try: - # Read the notebook - with open(nb_path, 'r', encoding='utf-8') as f: - content = f.read() - - # Handle VS Code XML format - if '(.*?)', re.DOTALL) - for match in cell_pattern.finditer(content): - cell_type, cell_content = match.groups() - - if cell_type == "markdown": - cells.append(nbformat.v4.new_markdown_cell(cell_content.strip())) - elif cell_type in ["python", "javascript", "java", "typescript"]: - cells.append(nbformat.v4.new_code_cell(cell_content.strip())) - - # Create a new notebook - nb = nbformat.v4.new_notebook() - nb.cells = cells - - # Add metadata - nb.metadata = { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - } - else: - # Standard JSON format notebook - nb = nbformat.reads(content, as_version=4) - - # Ensure widget state exists if needed - if "widgets" in str(content): - print(f" Adding widget state metadata...") - if "metadata" not in nb: - nb["metadata"] = {} - if "widgets" not in nb["metadata"]: - nb.metadata["widgets"] = { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } - } - - # Validate the notebook - validate(nb) - - # Write the fixed notebook - with open(nb_path, 'w', encoding='utf-8') as f: - nbformat.write(nb, f) - - print(f" Successfully processed {nb_path}") - - except Exception as e: - print(f" Error processing {nb_path}: {str(e)}") - continue - EOF - - # Run the notebook fixing script - python fix_notebooks.py - - - name: Configure Git + - name: Configure Git run: | - git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" + git config --global user.name "github-actions[bot]" - - name: Commit changes (if any) - run: | - git add "*.ipynb" - git commit -m "Fix notebook format for GitHub compatibility" || echo "No changes to commit" - - - name: Push changes (PR) - if: github.event_name == 'pull_request' - env: - TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - git fetch origin - git checkout -b ${{ github.event.pull_request.head.ref }} origin/${{ github.event.pull_request.head.ref }} - git pull --rebase origin ${{ github.event.pull_request.head.ref }} || echo "No rebase needed" - git push origin HEAD:${{ github.event.pull_request.head.ref }} + - name: Render notebooks for GitHub + run: python .github/workflows/render_notebooks.py - - name: Push changes (non-PR) - if: github.event_name != 'pull_request' + - name: Commit changes env: TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - git remote set-url origin https://x-access-token:${TOKEN}@github.com/${{ github.repository }} - git push || echo "No changes to push" - - - name: Create Pull Request (non-PR) - if: github.event_name != 'pull_request' - uses: peter-evans/create-pull-request@v6 - with: - token: ${{ secrets.GITHUB_TOKEN }} - branch: fix-notebook-format - title: "Fix notebook format for GitHub compatibility" - body: | - This PR fixes Jupyter notebook formatting issues to ensure proper rendering on GitHub. - - The workflow addresses: - - Converting VS Code XML format to standard Jupyter JSON format - - Adding missing widget state metadata - - Ensuring notebook format complies with GitHub's requirements - - These changes allow notebooks to render properly in GitHub's notebook viewer. - base: main + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + git fetch origin ${{ github.event.pull_request.head.ref }} + git pull --rebase origin ${{ github.event.pull_request.head.ref }} || echo "No rebase needed" + git add "**/*.ipynb" + git commit -m "Render notebooks for GitHub compatibility" || echo "No changes to commit" + git remote set-url origin https://x-access-token:${TOKEN}@github.com/${{ github.repository }} + git push origin HEAD:${{ github.event.pull_request.head.ref }} + else + git add "**/*.ipynb" + git commit -m "Render notebooks for GitHub compatibility" || echo "No changes to commit" + git remote set-url origin https://x-access-token:${TOKEN}@github.com/${{ github.repository }} + git push origin ${{ github.ref_name }} + fi From b54873b9e12e8066a99bf47fee9d37a92eabcb82 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:20:14 -0600 Subject: [PATCH 06/22] Eliminate backup creation in XML conversion --- ...nder_notebooks.py => convert_notebooks.py} | 68 +++---------------- 1 file changed, 8 insertions(+), 60 deletions(-) rename .github/workflows/{render_notebooks.py => convert_notebooks.py} (71%) diff --git a/.github/workflows/render_notebooks.py b/.github/workflows/convert_notebooks.py similarity index 71% rename from .github/workflows/render_notebooks.py rename to .github/workflows/convert_notebooks.py index ed674cd..490ffe3 100644 --- a/.github/workflows/render_notebooks.py +++ b/.github/workflows/convert_notebooks.py @@ -50,7 +50,14 @@ def convert_notebook(filepath): cells = [] cell_pattern = re.compile(r'(.*?)', re.DOTALL) - for match in cell_pattern.finditer(content): + matches = list(cell_pattern.finditer(content)) + if not matches: + print(f" WARNING: No cells found in {filepath}") + return False + + print(f" Found {len(matches)} cells") + + for match in matches: cell_type, cell_content = match.groups() if cell_type == "markdown": @@ -139,65 +146,6 @@ def convert_notebook(filepath): print(f" ERROR processing {filepath}: {str(e)}") return False -def verify_notebooks(directory="."): - """Check all notebooks are in valid Jupyter format for GitHub""" - notebook_files = [] - for root, dirs, files in os.walk(directory): - if '.git' in dirs: - dirs.remove('.git') - if '.github' in dirs: - dirs.remove('.github') - for file in files: - if file.endswith('.ipynb'): - notebook_files.append(os.path.join(root, file)) - - print(f"\nVerifying {len(notebook_files)} notebooks for GitHub compatibility") - - errors = 0 - for nb_path in notebook_files: - print(f"Checking {nb_path}") - try: - with open(nb_path, 'r', encoding='utf-8') as f: - content = f.read() - - # Check if it's still in XML format - if ' 0: - print(f"\n{errors} notebooks may have issues with GitHub rendering") - else: - print("\nAll notebooks are properly formatted for GitHub rendering") - - return errors - if __name__ == "__main__": print("Rendering notebooks for GitHub compatibility...") process_notebooks() - verify_notebooks() From 330f99b23073ea3156fb7b36ea9a2e05867d79ef Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:21:24 -0600 Subject: [PATCH 07/22] Refactor notebook rendering workflow Updated workflow to render notebooks for GitHub, added checks for changes, and improved Python version specification. --- .github/workflows/render-notebooks.yml | 53 +++++++++++++++----------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/.github/workflows/render-notebooks.yml b/.github/workflows/render-notebooks.yml index 195daf8..228ff53 100644 --- a/.github/workflows/render-notebooks.yml +++ b/.github/workflows/render-notebooks.yml @@ -1,12 +1,12 @@ -name: Render for GitHub +name: Render Notebooks for GitHub on: - pull_request: - branches: - - main push: paths: - '**.ipynb' + pull_request: + branches: + - main workflow_dispatch: # Allows manual triggering permissions: @@ -22,39 +22,46 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.head_ref || github.ref_name }} # Explicitly checkout the branch that triggered the workflow - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.x' + python-version: '3.10' - name: Install dependencies run: | python -m pip install --upgrade pip pip install nbformat nbconvert jupyter - - name: Configure Git + - name: Run conversion script + run: python .github/workflows/convert_notebooks.py + + - name: Configure Git run: | git config --global user.email "github-actions[bot]@users.noreply.github.com" git config --global user.name "github-actions[bot]" - - name: Render notebooks for GitHub - run: python .github/workflows/render_notebooks.py - - - name: Commit changes - env: - TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Check for changes + id: git-check run: | - if [[ "${{ github.event_name }}" == "pull_request" ]]; then - git fetch origin ${{ github.event.pull_request.head.ref }} - git pull --rebase origin ${{ github.event.pull_request.head.ref }} || echo "No rebase needed" - git add "**/*.ipynb" - git commit -m "Render notebooks for GitHub compatibility" || echo "No changes to commit" - git remote set-url origin https://x-access-token:${TOKEN}@github.com/${{ github.repository }} - git push origin HEAD:${{ github.event.pull_request.head.ref }} + if [[ -n "$(git status --porcelain **/*.ipynb)" ]]; then + echo "has_changes=true" >> $GITHUB_OUTPUT else - git add "**/*.ipynb" - git commit -m "Render notebooks for GitHub compatibility" || echo "No changes to commit" - git remote set-url origin https://x-access-token:${TOKEN}@github.com/${{ github.repository }} - git push origin ${{ github.ref_name }} + echo "has_changes=false" >> $GITHUB_OUTPUT fi + + - name: Show modified files + if: steps.git-check.outputs.has_changes == 'true' + run: git status + + - name: Commit and push changes to current branch + if: steps.git-check.outputs.has_changes == 'true' + run: | + CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) + echo "Current branch: $CURRENT_BRANCH" + + # Add, commit, and push changes + git add "**/*.ipynb" + git commit -m "Render notebooks for GitHub compatibility" + git push https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git HEAD:${CURRENT_BRANCH} From b61122f294676ad322f6599433c74eb57759fbf0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 17:21:40 +0000 Subject: [PATCH 08/22] Update visitor count --- 0_Overview.md | 2 +- AzurePortal/1_MedallionArch/README.md | 2 +- AzurePortal/1_MedallionArch/docs/README.md | 2 +- AzurePortal/2_AI_LLMs/README.md | 2 +- AzurePortal/3_AISkills.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/README.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md | 2 +- AzurePortal/4_CICD/1_github-integration.md | 2 +- README.md | 2 +- Terraform/README.md | 2 +- Terraform/troubleshooting.md | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/0_Overview.md b/0_Overview.md index 004364c..f9f5edc 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -104,7 +104,7 @@ Click here for more information about: [Z-Order & V-Order](https://github.com/Mi
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index 38cfdd5..ad40444 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -299,7 +299,7 @@ https://github.com/user-attachments/assets/2a64762a-f120-4448-b0fb-7a49f4d1bedb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index 0b1d77a..d54e08b 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -15,7 +15,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 66f8921..050bbbb 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -397,7 +397,7 @@ Make sure to replace `"your_openai_api_key"`, `"https://your_openai_api_base/"`,
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index b52cc6b..629d62f 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -99,7 +99,7 @@ Key Features:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index c98acca..d4b3276 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -196,7 +196,7 @@ Steps to Set Up Incremental Refresh:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index 1f2fe3c..8f1552e 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -14,7 +14,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index 6ec3d0f..dbc9a99 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -64,7 +64,7 @@ https://github.com/user-attachments/assets/64f099a1-b749-47a6-b723-fa1cb5c575a3
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/README.md b/README.md index e163bf2..4bbc074 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ This is an introductory workshop on Microsoft Fabric. Please follow as described
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/README.md b/Terraform/README.md index a513e3a..608dcf1 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -143,7 +143,7 @@ https://github.com/user-attachments/assets/1ab31707-6f4c-4ec7-9e92-5d5cc96ac5bb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index cfdedf9..77ae8ad 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -176,7 +176,7 @@ Error: Failed to get existing workspaces: Error retrieving keys for Storage Acco
- Total views + Total views

Refresh Date: 2025-09-11

From 2e6a259e365a540f85853ba9ce227271c5adb021 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:00:35 -0600 Subject: [PATCH 09/22] Enhance notebook processing with logging --- .github/workflows/convert_notebooks.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/convert_notebooks.py b/.github/workflows/convert_notebooks.py index 490ffe3..b442662 100644 --- a/.github/workflows/convert_notebooks.py +++ b/.github/workflows/convert_notebooks.py @@ -15,14 +15,21 @@ def process_notebooks(directory="."): """Find and process all notebook files in the repository""" notebook_files = [] + print(f"Searching for notebooks in directory: {directory}") for root, dirs, files in os.walk(directory): + # Skip directories that should be excluded if '.git' in dirs: dirs.remove('.git') # Skip git directory if '.github' in dirs: dirs.remove('.github') # Skip GitHub directory + if '.venv' in dirs: + dirs.remove('.venv') # Skip virtual environments + for file in files: if file.endswith('.ipynb'): - notebook_files.append(os.path.join(root, file)) + notebook_path = os.path.join(root, file) + print(f"Found notebook: {notebook_path}") + notebook_files.append(notebook_path) print(f"Found {len(notebook_files)} notebooks to process") @@ -148,4 +155,7 @@ def convert_notebook(filepath): if __name__ == "__main__": print("Rendering notebooks for GitHub compatibility...") - process_notebooks() + # Get the repository root directory from environment variable if available + repo_root = os.environ.get('GITHUB_WORKSPACE', '.') + print(f"Repository root: {repo_root}") + process_notebooks(repo_root) From bae9f297b7fcfbe4d6cfadaeb869235047310a64 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:03:54 -0600 Subject: [PATCH 10/22] Improve notebook change detection in workflow --- .github/workflows/render-notebooks.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/render-notebooks.yml b/.github/workflows/render-notebooks.yml index 228ff53..92b5965 100644 --- a/.github/workflows/render-notebooks.yml +++ b/.github/workflows/render-notebooks.yml @@ -45,10 +45,14 @@ jobs: - name: Check for changes id: git-check run: | - if [[ -n "$(git status --porcelain **/*.ipynb)" ]]; then - echo "has_changes=true" >> $GITHUB_OUTPUT - else + # More reliable way to check for changes in notebooks + git update-index --refresh + if git diff-index --quiet HEAD -- "*.ipynb"; then + echo "No changes detected in notebooks" echo "has_changes=false" >> $GITHUB_OUTPUT + else + echo "Changes detected in notebooks" + echo "has_changes=true" >> $GITHUB_OUTPUT fi - name: Show modified files From 19ab13d3a968c9f93a124375f7a955589923478c Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:05:11 -0600 Subject: [PATCH 11/22] testing --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 4bbc074..539bb76 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,6 @@ Last updated: 2025-09-11 - For this workshop, you can set up your infrastructure using either of the following approaches: 1. [Infrastructure via Azure Portal](./AzurePortal/): This approach involves creating the infrastructure and performing `all necessary steps through the Azure Portal` and its resources interface. From bebf3ec7ad75eed28a87ead445a8507334d582a7 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:17:05 -0600 Subject: [PATCH 12/22] testing render From d20fd565c87f21bc5c8b1da97e75bb1d314e9655 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:17:25 -0600 Subject: [PATCH 13/22] Delete AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb --- .../1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb | 1 - 1 file changed, 1 deletion(-) delete mode 100644 AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb diff --git a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb deleted file mode 100644 index 04efdcf..0000000 --- a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"markdown","source":["# PySpark Code to Move Data from Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13"},{"cell_type":"markdown","source":["This is an example on how to work with the medallion architecture. From Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"44f47922-4e3b-45cc-81a6-c5de97634f73"},{"cell_type":"markdown","source":["## Working with 2020orders information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4e1afae8-b2ef-4e4c-9ac9-485139f19e9c"},{"cell_type":"code","source":["from pyspark.sql.types import *\n","import pyspark.sql.functions \n","from pyspark.sql import *"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"febb6c3e-6841-42c1-a633-0da056b7f69c"},{"cell_type":"code","source":["# Read the data from the bronze layer:\n","df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n","\n","df_raw_2020orders.head(2)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":4,"statement_ids":[4],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:11:30.8930212Z","session_start_time":null,"execution_start_time":"2024-10-25T19:14:18.2840528Z","execution_finish_time":"2024-10-25T19:14:24.7554027Z","parent_msg_id":"4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":11,"data":{"text/plain":"[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]"},"metadata":{}}],"execution_count":2,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"6357dc47-1bb1-4391-8f38-b5d5a2abf5b2"},{"cell_type":"code","source":["# Clean the data (e.g., filter out rows with null values in the 'age' column):\n","df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n","print(df_cleaned)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[10],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:21:35.4162864Z","session_start_time":null,"execution_start_time":"2024-10-25T19:21:35.9099818Z","execution_finish_time":"2024-10-25T19:21:36.2079156Z","parent_msg_id":"d65f6fd9-d9ab-4498-ab5d-0710bab459be"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n"]}],"execution_count":8,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"5f5c8125-cbf5-4e00-9d8f-0c437f25b37f"},{"cell_type":"code","source":["# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n","df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:23:52.3238132Z","session_start_time":null,"execution_start_time":"2024-10-25T19:23:52.7414203Z","execution_finish_time":"2024-10-25T19:24:09.4412514Z","parent_msg_id":"8c92d669-7856-4961-a9d0-c38d54833ee4"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":10,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"c83d4e46-2b49-490f-aadb-87a350c85e89"},{"cell_type":"markdown","source":["## Working with products information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f830afb3-2b02-4076-800a-85ca9fc33fea"},{"cell_type":"code","source":["# Read data from the Bronze layer\n","bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n","# Perform transformations (if any)\n","silver_df = bronze_df # Assuming no transformations for simplicity\n","# Write data to the Silver layer\n","silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:27:20.1106109Z","session_start_time":null,"execution_start_time":"2024-10-25T19:27:20.5334249Z","execution_finish_time":"2024-10-25T19:27:25.4936309Z","parent_msg_id":"bf665ff4-43d5-4b02-90a6-6c28640576c3"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"7f72ac98-4ece-4a8a-a5c5-5e1fc7273382"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"widgets":{},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"251cd515-16a3-4555-a3d2-dfd12adb2335","default_lakehouse_name":"raw_Bronze","default_lakehouse_workspace_id":"597e0afc-c8db-4f4d-8464-d13570f5b075"}}},"nbformat":4,"nbformat_minor":5} From 133c2ca2ef27c6d44bfbb92da6344bce866b29c2 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:17:47 -0600 Subject: [PATCH 14/22] testing render restaured --- .../1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb diff --git a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb new file mode 100644 index 0000000..04efdcf --- /dev/null +++ b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["# PySpark Code to Move Data from Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13"},{"cell_type":"markdown","source":["This is an example on how to work with the medallion architecture. From Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"44f47922-4e3b-45cc-81a6-c5de97634f73"},{"cell_type":"markdown","source":["## Working with 2020orders information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4e1afae8-b2ef-4e4c-9ac9-485139f19e9c"},{"cell_type":"code","source":["from pyspark.sql.types import *\n","import pyspark.sql.functions \n","from pyspark.sql import *"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"febb6c3e-6841-42c1-a633-0da056b7f69c"},{"cell_type":"code","source":["# Read the data from the bronze layer:\n","df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n","\n","df_raw_2020orders.head(2)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":4,"statement_ids":[4],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:11:30.8930212Z","session_start_time":null,"execution_start_time":"2024-10-25T19:14:18.2840528Z","execution_finish_time":"2024-10-25T19:14:24.7554027Z","parent_msg_id":"4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":11,"data":{"text/plain":"[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]"},"metadata":{}}],"execution_count":2,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"6357dc47-1bb1-4391-8f38-b5d5a2abf5b2"},{"cell_type":"code","source":["# Clean the data (e.g., filter out rows with null values in the 'age' column):\n","df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n","print(df_cleaned)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[10],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:21:35.4162864Z","session_start_time":null,"execution_start_time":"2024-10-25T19:21:35.9099818Z","execution_finish_time":"2024-10-25T19:21:36.2079156Z","parent_msg_id":"d65f6fd9-d9ab-4498-ab5d-0710bab459be"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n"]}],"execution_count":8,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"5f5c8125-cbf5-4e00-9d8f-0c437f25b37f"},{"cell_type":"code","source":["# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n","df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:23:52.3238132Z","session_start_time":null,"execution_start_time":"2024-10-25T19:23:52.7414203Z","execution_finish_time":"2024-10-25T19:24:09.4412514Z","parent_msg_id":"8c92d669-7856-4961-a9d0-c38d54833ee4"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":10,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"c83d4e46-2b49-490f-aadb-87a350c85e89"},{"cell_type":"markdown","source":["## Working with products information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f830afb3-2b02-4076-800a-85ca9fc33fea"},{"cell_type":"code","source":["# Read data from the Bronze layer\n","bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n","# Perform transformations (if any)\n","silver_df = bronze_df # Assuming no transformations for simplicity\n","# Write data to the Silver layer\n","silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:27:20.1106109Z","session_start_time":null,"execution_start_time":"2024-10-25T19:27:20.5334249Z","execution_finish_time":"2024-10-25T19:27:25.4936309Z","parent_msg_id":"bf665ff4-43d5-4b02-90a6-6c28640576c3"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"7f72ac98-4ece-4a8a-a5c5-5e1fc7273382"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"widgets":{},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"251cd515-16a3-4555-a3d2-dfd12adb2335","default_lakehouse_name":"raw_Bronze","default_lakehouse_workspace_id":"597e0afc-c8db-4f4d-8464-d13570f5b075"}}},"nbformat":4,"nbformat_minor":5} From a7200af0d64f02c70b83a9fa2c433c48d0f67337 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:18:08 -0600 Subject: [PATCH 15/22] Delete AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb --- .../1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb | 1 - 1 file changed, 1 deletion(-) delete mode 100644 AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb diff --git a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb deleted file mode 100644 index 04efdcf..0000000 --- a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"markdown","source":["# PySpark Code to Move Data from Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13"},{"cell_type":"markdown","source":["This is an example on how to work with the medallion architecture. From Bronze to Silver"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"44f47922-4e3b-45cc-81a6-c5de97634f73"},{"cell_type":"markdown","source":["## Working with 2020orders information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"4e1afae8-b2ef-4e4c-9ac9-485139f19e9c"},{"cell_type":"code","source":["from pyspark.sql.types import *\n","import pyspark.sql.functions \n","from pyspark.sql import *"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"febb6c3e-6841-42c1-a633-0da056b7f69c"},{"cell_type":"code","source":["# Read the data from the bronze layer:\n","df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n","\n","df_raw_2020orders.head(2)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":4,"statement_ids":[4],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:11:30.8930212Z","session_start_time":null,"execution_start_time":"2024-10-25T19:14:18.2840528Z","execution_finish_time":"2024-10-25T19:14:24.7554027Z","parent_msg_id":"4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":11,"data":{"text/plain":"[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]"},"metadata":{}}],"execution_count":2,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"6357dc47-1bb1-4391-8f38-b5d5a2abf5b2"},{"cell_type":"code","source":["# Clean the data (e.g., filter out rows with null values in the 'age' column):\n","df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n","print(df_cleaned)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":10,"statement_ids":[10],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:21:35.4162864Z","session_start_time":null,"execution_start_time":"2024-10-25T19:21:35.9099818Z","execution_finish_time":"2024-10-25T19:21:36.2079156Z","parent_msg_id":"d65f6fd9-d9ab-4498-ab5d-0710bab459be"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n"]}],"execution_count":8,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"5f5c8125-cbf5-4e00-9d8f-0c437f25b37f"},{"cell_type":"code","source":["# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n","df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":12,"statement_ids":[12],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:23:52.3238132Z","session_start_time":null,"execution_start_time":"2024-10-25T19:23:52.7414203Z","execution_finish_time":"2024-10-25T19:24:09.4412514Z","parent_msg_id":"8c92d669-7856-4961-a9d0-c38d54833ee4"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)"},"metadata":{}}],"execution_count":10,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"c83d4e46-2b49-490f-aadb-87a350c85e89"},{"cell_type":"markdown","source":["## Working with products information"],"metadata":{"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f830afb3-2b02-4076-800a-85ca9fc33fea"},{"cell_type":"code","source":["# Read data from the Bronze layer\n","bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n","# Perform transformations (if any)\n","silver_df = bronze_df # Assuming no transformations for simplicity\n","# Write data to the Silver layer\n","silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"ecb846c9-e988-4906-95bc-af67b3aacd94","normalized_state":"finished","queued_time":"2024-10-25T19:27:20.1106109Z","session_start_time":null,"execution_start_time":"2024-10-25T19:27:20.5334249Z","execution_finish_time":"2024-10-25T19:27:25.4936309Z","parent_msg_id":"bf665ff4-43d5-4b02-90a6-6c28640576c3"},"text/plain":"StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"7f72ac98-4ece-4a8a-a5c5-5e1fc7273382"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"widgets":{},"nteract":{"version":"nteract-front-end@1.0.0"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"251cd515-16a3-4555-a3d2-dfd12adb2335","default_lakehouse_name":"raw_Bronze","default_lakehouse_workspace_id":"597e0afc-c8db-4f4d-8464-d13570f5b075"}}},"nbformat":4,"nbformat_minor":5} From 55828068b6a50966d7f314035e3d02d3f488516f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 18:18:14 +0000 Subject: [PATCH 16/22] Update visitor count --- 0_Overview.md | 2 +- AzurePortal/1_MedallionArch/README.md | 2 +- AzurePortal/1_MedallionArch/docs/README.md | 2 +- AzurePortal/2_AI_LLMs/README.md | 2 +- AzurePortal/3_AISkills.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/README.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md | 2 +- AzurePortal/4_CICD/1_github-integration.md | 2 +- README.md | 2 +- Terraform/README.md | 2 +- Terraform/troubleshooting.md | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/0_Overview.md b/0_Overview.md index f9f5edc..878d624 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -104,7 +104,7 @@ Click here for more information about: [Z-Order & V-Order](https://github.com/Mi
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index ad40444..e64a298 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -299,7 +299,7 @@ https://github.com/user-attachments/assets/2a64762a-f120-4448-b0fb-7a49f4d1bedb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index d54e08b..baf619f 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -15,7 +15,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 050bbbb..909e4b3 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -397,7 +397,7 @@ Make sure to replace `"your_openai_api_key"`, `"https://your_openai_api_base/"`,
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index 629d62f..aaecd13 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -99,7 +99,7 @@ Key Features:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index d4b3276..058bccc 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -196,7 +196,7 @@ Steps to Set Up Incremental Refresh:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index 8f1552e..435e6e0 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -14,7 +14,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index dbc9a99..f6f3151 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -64,7 +64,7 @@ https://github.com/user-attachments/assets/64f099a1-b749-47a6-b723-fa1cb5c575a3
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/README.md b/README.md index 539bb76..f72cdce 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ This is an introductory workshop on Microsoft Fabric. Please follow as described
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/README.md b/Terraform/README.md index 608dcf1..3dff5c2 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -143,7 +143,7 @@ https://github.com/user-attachments/assets/1ab31707-6f4c-4ec7-9e92-5d5cc96ac5bb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index 77ae8ad..2d4994c 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -176,7 +176,7 @@ Error: Failed to get existing workspaces: Error retrieving keys for Storage Acco
- Total views + Total views

Refresh Date: 2025-09-11

From 90c463de00c82b8b53e53c9583f69e86f877bd2e Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:18:38 -0600 Subject: [PATCH 17/22] render v3 --- .../src/0_notebook_bronze_to_silver.ipynb | 363 ++++++++++++++++++ 1 file changed, 363 insertions(+) create mode 100644 AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb diff --git a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb new file mode 100644 index 0000000..d481169 --- /dev/null +++ b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb @@ -0,0 +1,363 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# PySpark Code to Move Data from Bronze to Silver" + ] + }, + { + "cell_type": "markdown", + "id": "44f47922-4e3b-45cc-81a6-c5de97634f73", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "This is an example on how to work with the medallion architecture. From Bronze to Silver" + ] + }, + { + "cell_type": "markdown", + "id": "4e1afae8-b2ef-4e4c-9ac9-485139f19e9c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Working with 2020orders information" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "febb6c3e-6841-42c1-a633-0da056b7f69c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import *\n", + "import pyspark.sql.functions \n", + "from pyspark.sql import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6357dc47-1bb1-4391-8f38-b5d5a2abf5b2", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:14:24.7554027Z", + "execution_start_time": "2024-10-25T19:14:18.2840528Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf", + "queued_time": "2024-10-25T19:11:30.8930212Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 4, + "statement_ids": [ + 4 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n", + " Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the data from the bronze layer:\n", + "df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n", + "\n", + "df_raw_2020orders.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5f5c8125-cbf5-4e00-9d8f-0c437f25b37f", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:21:36.2079156Z", + "execution_start_time": "2024-10-25T19:21:35.9099818Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "d65f6fd9-d9ab-4498-ab5d-0710bab459be", + "queued_time": "2024-10-25T19:21:35.4162864Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 10, + "statement_ids": [ + 10 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n" + ] + } + ], + "source": [ + "# Clean the data (e.g., filter out rows with null values in the 'age' column):\n", + "df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n", + "print(df_cleaned)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c83d4e46-2b49-490f-aadb-87a350c85e89", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:24:09.4412514Z", + "execution_start_time": "2024-10-25T19:23:52.7414203Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "8c92d669-7856-4961-a9d0-c38d54833ee4", + "queued_time": "2024-10-25T19:23:52.3238132Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 12, + "statement_ids": [ + 12 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n", + "df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")" + ] + }, + { + "cell_type": "markdown", + "id": "f830afb3-2b02-4076-800a-85ca9fc33fea", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Working with products information" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7f72ac98-4ece-4a8a-a5c5-5e1fc7273382", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:27:25.4936309Z", + "execution_start_time": "2024-10-25T19:27:20.5334249Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "bf665ff4-43d5-4b02-90a6-6c28640576c3", + "queued_time": "2024-10-25T19:27:20.1106109Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 15, + "statement_ids": [ + 15 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Read data from the Bronze layer\n", + "bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n", + "# Perform transformations (if any)\n", + "silver_df = bronze_df # Assuming no transformations for simplicity\n", + "# Write data to the Silver layer\n", + "silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")" + ] + } + ], + "metadata": { + "dependencies": { + "lakehouse": { + "default_lakehouse": "251cd515-16a3-4555-a3d2-dfd12adb2335", + "default_lakehouse_name": "raw_Bronze", + "default_lakehouse_workspace_id": "597e0afc-c8db-4f4d-8464-d13570f5b075" + } + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "widgets": {} + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 730db06656e14105beadcaafdf781de61b585c6a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 18:18:55 +0000 Subject: [PATCH 18/22] Update visitor count --- 0_Overview.md | 2 +- AzurePortal/1_MedallionArch/README.md | 2 +- AzurePortal/1_MedallionArch/docs/README.md | 2 +- AzurePortal/2_AI_LLMs/README.md | 2 +- AzurePortal/3_AISkills.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/README.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md | 2 +- AzurePortal/4_CICD/1_github-integration.md | 2 +- README.md | 2 +- Terraform/README.md | 2 +- Terraform/troubleshooting.md | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/0_Overview.md b/0_Overview.md index 878d624..f9f5edc 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -104,7 +104,7 @@ Click here for more information about: [Z-Order & V-Order](https://github.com/Mi
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index e64a298..ad40444 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -299,7 +299,7 @@ https://github.com/user-attachments/assets/2a64762a-f120-4448-b0fb-7a49f4d1bedb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index baf619f..d54e08b 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -15,7 +15,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 909e4b3..050bbbb 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -397,7 +397,7 @@ Make sure to replace `"your_openai_api_key"`, `"https://your_openai_api_base/"`,
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index aaecd13..629d62f 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -99,7 +99,7 @@ Key Features:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index 058bccc..d4b3276 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -196,7 +196,7 @@ Steps to Set Up Incremental Refresh:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index 435e6e0..8f1552e 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -14,7 +14,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index f6f3151..dbc9a99 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -64,7 +64,7 @@ https://github.com/user-attachments/assets/64f099a1-b749-47a6-b723-fa1cb5c575a3
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/README.md b/README.md index f72cdce..539bb76 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ This is an introductory workshop on Microsoft Fabric. Please follow as described
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/README.md b/Terraform/README.md index 3dff5c2..608dcf1 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -143,7 +143,7 @@ https://github.com/user-attachments/assets/1ab31707-6f4c-4ec7-9e92-5d5cc96ac5bb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index 2d4994c..77ae8ad 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -176,7 +176,7 @@ Error: Failed to get existing workspaces: Error retrieving keys for Storage Acco
- Total views + Total views

Refresh Date: 2025-09-11

From 9746c48b936f235e65c207dfc23507bb312b81d3 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:22:45 -0600 Subject: [PATCH 19/22] Enhance notebook converter for GitHub compatibility --- .github/workflows/convert_notebooks.py | 48 ++++++++++++++------------ 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/.github/workflows/convert_notebooks.py b/.github/workflows/convert_notebooks.py index b442662..b6ee9dd 100644 --- a/.github/workflows/convert_notebooks.py +++ b/.github/workflows/convert_notebooks.py @@ -2,8 +2,9 @@ """ Notebook to GitHub-Compatible Format Converter -This script renders XML-format notebooks to standard Jupyter JSON format -with the required widget state metadata for GitHub rendering. +This script fixes Jupyter notebooks for GitHub rendering by: +1. Converting XML-format notebooks to standard Jupyter JSON format +2. Cleaning widget metadata that can cause GitHub rendering issues """ import os @@ -42,7 +43,7 @@ def process_notebooks(directory="."): return success_count def convert_notebook(filepath): - """Convert a XML notebook to standard Jupyter JSON format""" + """Convert a notebook to GitHub-compatible format by cleaning widget metadata""" print(f"\nProcessing {filepath}") try: @@ -99,6 +100,7 @@ def convert_notebook(filepath): "pygments_lexer": "ipython3", "version": "3.8.10" }, + # Add empty widget state to prevent GitHub rendering issues "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, @@ -117,34 +119,34 @@ def convert_notebook(filepath): return True else: - # It's already in JSON format, check if it has widget state + # It's already in JSON format, clean widget metadata try: - nb_dict = json.loads(content) + notebook = json.loads(content) + print(f" Cleaning widget metadata...") - # Check if we need to add widget state metadata - if "widgets" not in nb_dict.get("metadata", {}): - print(f" Adding widget state metadata to JSON notebook...") - nb = nbformat.reads(content, as_version=4) - if "metadata" not in nb: - nb.metadata = {} - nb.metadata["widgets"] = { + # Remove potentially problematic widget state but keep proper structure + if 'metadata' in notebook: + # Replace with clean widget state + notebook['metadata']['widgets'] = { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } - - # Validate and write the notebook - validate(nb) - with open(filepath, 'w', encoding='utf-8') as f: - nbformat.write(nb, f) - - print(f" Successfully added widget state to {filepath}") - return True - else: - print(f" Notebook already in correct format for GitHub, no changes needed") - return True + + # Clean widget metadata from cells as well + for cell in notebook.get('cells', []): + if 'metadata' in cell and 'widgets' in cell['metadata']: + del cell['metadata']['widgets'] + + # Write the cleaned notebook + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(notebook, f, indent=2) + + print(f" Successfully cleaned {filepath} for GitHub compatibility") + return True + except json.JSONDecodeError: print(f" ERROR: {filepath} is not in valid JSON format or XML format") return False From 7b7fcda6b889518fc9be6e58fc108d2726a0ff9e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 18:23:06 +0000 Subject: [PATCH 20/22] Update visitor count --- 0_Overview.md | 2 +- AzurePortal/1_MedallionArch/README.md | 2 +- AzurePortal/1_MedallionArch/docs/README.md | 2 +- AzurePortal/2_AI_LLMs/README.md | 2 +- AzurePortal/3_AISkills.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/README.md | 2 +- AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md | 2 +- AzurePortal/4_CICD/1_github-integration.md | 2 +- README.md | 2 +- Terraform/README.md | 2 +- Terraform/troubleshooting.md | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/0_Overview.md b/0_Overview.md index f9f5edc..878d624 100644 --- a/0_Overview.md +++ b/0_Overview.md @@ -104,7 +104,7 @@ Click here for more information about: [Z-Order & V-Order](https://github.com/Mi
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/README.md b/AzurePortal/1_MedallionArch/README.md index ad40444..e64a298 100644 --- a/AzurePortal/1_MedallionArch/README.md +++ b/AzurePortal/1_MedallionArch/README.md @@ -299,7 +299,7 @@ https://github.com/user-attachments/assets/2a64762a-f120-4448-b0fb-7a49f4d1bedb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/1_MedallionArch/docs/README.md b/AzurePortal/1_MedallionArch/docs/README.md index d54e08b..baf619f 100644 --- a/AzurePortal/1_MedallionArch/docs/README.md +++ b/AzurePortal/1_MedallionArch/docs/README.md @@ -15,7 +15,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/2_AI_LLMs/README.md b/AzurePortal/2_AI_LLMs/README.md index 050bbbb..909e4b3 100644 --- a/AzurePortal/2_AI_LLMs/README.md +++ b/AzurePortal/2_AI_LLMs/README.md @@ -397,7 +397,7 @@ Make sure to replace `"your_openai_api_key"`, `"https://your_openai_api_base/"`,
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/3_AISkills.md b/AzurePortal/3_AISkills.md index 629d62f..aaecd13 100644 --- a/AzurePortal/3_AISkills.md +++ b/AzurePortal/3_AISkills.md @@ -99,7 +99,7 @@ Key Features:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/README.md index d4b3276..058bccc 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/README.md @@ -196,7 +196,7 @@ Steps to Set Up Incremental Refresh:
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md index 8f1552e..435e6e0 100644 --- a/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md +++ b/AzurePortal/4_CICD/0_deployment-pipelines/samples/README.md @@ -14,7 +14,7 @@ Last updated: 2025-09-11
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/AzurePortal/4_CICD/1_github-integration.md b/AzurePortal/4_CICD/1_github-integration.md index dbc9a99..f6f3151 100644 --- a/AzurePortal/4_CICD/1_github-integration.md +++ b/AzurePortal/4_CICD/1_github-integration.md @@ -64,7 +64,7 @@ https://github.com/user-attachments/assets/64f099a1-b749-47a6-b723-fa1cb5c575a3
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/README.md b/README.md index 539bb76..f72cdce 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ This is an introductory workshop on Microsoft Fabric. Please follow as described
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/README.md b/Terraform/README.md index 608dcf1..3dff5c2 100644 --- a/Terraform/README.md +++ b/Terraform/README.md @@ -143,7 +143,7 @@ https://github.com/user-attachments/assets/1ab31707-6f4c-4ec7-9e92-5d5cc96ac5bb
- Total views + Total views

Refresh Date: 2025-09-11

diff --git a/Terraform/troubleshooting.md b/Terraform/troubleshooting.md index 77ae8ad..2d4994c 100644 --- a/Terraform/troubleshooting.md +++ b/Terraform/troubleshooting.md @@ -176,7 +176,7 @@ Error: Failed to get existing workspaces: Error retrieving keys for Storage Acco
- Total views + Total views

Refresh Date: 2025-09-11

From 0cdaee2e2a43cdf56053d18185d2d6e1a8b62ce6 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Thu, 11 Sep 2025 12:25:00 -0600 Subject: [PATCH 21/22] Refactor notebook rendering workflow for efficiency --- .github/workflows/render-notebooks.yml | 34 +++++++++----------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/.github/workflows/render-notebooks.yml b/.github/workflows/render-notebooks.yml index 92b5965..0577b6a 100644 --- a/.github/workflows/render-notebooks.yml +++ b/.github/workflows/render-notebooks.yml @@ -42,30 +42,20 @@ jobs: git config --global user.email "github-actions[bot]@users.noreply.github.com" git config --global user.name "github-actions[bot]" - - name: Check for changes - id: git-check - run: | - # More reliable way to check for changes in notebooks - git update-index --refresh - if git diff-index --quiet HEAD -- "*.ipynb"; then - echo "No changes detected in notebooks" - echo "has_changes=false" >> $GITHUB_OUTPUT - else - echo "Changes detected in notebooks" - echo "has_changes=true" >> $GITHUB_OUTPUT - fi - - - name: Show modified files - if: steps.git-check.outputs.has_changes == 'true' - run: git status - - - name: Commit and push changes to current branch - if: steps.git-check.outputs.has_changes == 'true' + - name: Commit and push changes run: | CURRENT_BRANCH=$(git rev-parse --abbrev-ref HEAD) echo "Current branch: $CURRENT_BRANCH" - # Add, commit, and push changes + # Stage all notebook files git add "**/*.ipynb" - git commit -m "Render notebooks for GitHub compatibility" - git push https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git HEAD:${CURRENT_BRANCH} + + # Check if there are changes to commit + if git diff --staged --quiet; then + echo "No changes detected in notebooks" + else + echo "Changes detected in notebooks" + git commit -m "Fix notebooks for GitHub compatibility" + git push https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git HEAD:${CURRENT_BRANCH} + echo "Successfully pushed changes" + fi From 5cfed163670d39ab5b370d06a8f366de1b3592bd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 11 Sep 2025 18:25:34 +0000 Subject: [PATCH 22/22] Fix notebooks for GitHub compatibility --- .../src/0_notebook_bronze_to_silver.ipynb | 674 ++--- .../src/1_notebook_silver_to_gold.ipynb | 488 +++- .../src/fabric-llms-overview_sample.ipynb | 2405 +++++++++-------- 3 files changed, 2030 insertions(+), 1537 deletions(-) diff --git a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb index d481169..03ebd49 100644 --- a/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb +++ b/AzurePortal/1_MedallionArch/src/0_notebook_bronze_to_silver.ipynb @@ -1,363 +1,369 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# PySpark Code to Move Data from Bronze to Silver" - ] - }, - { - "cell_type": "markdown", - "id": "44f47922-4e3b-45cc-81a6-c5de97634f73", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "This is an example on how to work with the medallion architecture. From Bronze to Silver" - ] - }, - { - "cell_type": "markdown", - "id": "4e1afae8-b2ef-4e4c-9ac9-485139f19e9c", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Working with 2020orders information" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "febb6c3e-6841-42c1-a633-0da056b7f69c", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - } - }, - "outputs": [], - "source": [ - "from pyspark.sql.types import *\n", - "import pyspark.sql.functions \n", - "from pyspark.sql import *" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "6357dc47-1bb1-4391-8f38-b5d5a2abf5b2", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + "cells": [ + { + "cell_type": "markdown", + "id": "ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# PySpark Code to Move Data from Bronze to Silver" + ] }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" + { + "cell_type": "markdown", + "id": "44f47922-4e3b-45cc-81a6-c5de97634f73", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "This is an example on how to work with the medallion architecture. From Bronze to Silver" + ] }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-25T19:14:24.7554027Z", - "execution_start_time": "2024-10-25T19:14:18.2840528Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf", - "queued_time": "2024-10-25T19:11:30.8930212Z", - "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 4, - "statement_ids": [ - 4 - ] + "cell_type": "markdown", + "id": "4e1afae8-b2ef-4e4c-9ac9-485139f19e9c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } }, - "text/plain": [ - "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)" + "source": [ + "## Working with 2020orders information" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "data": { - "text/plain": [ - "[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n", - " Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]" + "cell_type": "code", + "execution_count": null, + "id": "febb6c3e-6841-42c1-a633-0da056b7f69c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.types import *\n", + "import pyspark.sql.functions \n", + "from pyspark.sql import *" ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Read the data from the bronze layer:\n", - "df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n", - "\n", - "df_raw_2020orders.head(2)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5f5c8125-cbf5-4e00-9d8f-0c437f25b37f", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" + { + "cell_type": "code", + "execution_count": 2, + "id": "6357dc47-1bb1-4391-8f38-b5d5a2abf5b2", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:14:24.7554027Z", + "execution_start_time": "2024-10-25T19:14:18.2840528Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "4d60fa03-18dd-4bbc-adab-cb12ff2ba6cf", + "queued_time": "2024-10-25T19:11:30.8930212Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 4, + "statement_ids": [ + 4 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 4, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n", + " Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Read the data from the bronze layer:\n", + "df_raw_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/2020orders\")\n", + "\n", + "df_raw_2020orders.head(2)" + ] }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-25T19:21:36.2079156Z", - "execution_start_time": "2024-10-25T19:21:35.9099818Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "d65f6fd9-d9ab-4498-ab5d-0710bab459be", - "queued_time": "2024-10-25T19:21:35.4162864Z", - "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 10, - "statement_ids": [ - 10 - ] + "cell_type": "code", + "execution_count": 8, + "id": "5f5c8125-cbf5-4e00-9d8f-0c437f25b37f", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } }, - "text/plain": [ - "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)" + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:21:36.2079156Z", + "execution_start_time": "2024-10-25T19:21:35.9099818Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "d65f6fd9-d9ab-4498-ab5d-0710bab459be", + "queued_time": "2024-10-25T19:21:35.4162864Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 10, + "statement_ids": [ + 10 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 10, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n" + ] + } + ], + "source": [ + "# Clean the data (e.g., filter out rows with null values in the 'age' column):\n", + "df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n", + "print(df_cleaned)" ] - }, - "metadata": {}, - "output_type": "display_data" }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "DataFrame[ID: string, Count: int, Date: string, Name: string, Style: string, price: double, tax: double]\n" - ] - } - ], - "source": [ - "# Clean the data (e.g., filter out rows with null values in the 'age' column):\n", - "df_cleaned = df_raw_2020orders.filter(df_raw_2020orders[\"Date\"].isNotNull())\n", - "print(df_cleaned)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "c83d4e46-2b49-490f-aadb-87a350c85e89", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + "cell_type": "code", + "execution_count": 10, + "id": "c83d4e46-2b49-490f-aadb-87a350c85e89", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:24:09.4412514Z", + "execution_start_time": "2024-10-25T19:23:52.7414203Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "8c92d669-7856-4961-a9d0-c38d54833ee4", + "queued_time": "2024-10-25T19:23:52.3238132Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 12, + "statement_ids": [ + 12 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n", + "df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")" + ] }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" + { + "cell_type": "markdown", + "id": "f830afb3-2b02-4076-800a-85ca9fc33fea", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Working with products information" + ] }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-25T19:24:09.4412514Z", - "execution_start_time": "2024-10-25T19:23:52.7414203Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "8c92d669-7856-4961-a9d0-c38d54833ee4", - "queued_time": "2024-10-25T19:23:52.3238132Z", - "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 12, - "statement_ids": [ - 12 - ] + "cell_type": "code", + "execution_count": 13, + "id": "7f72ac98-4ece-4a8a-a5c5-5e1fc7273382", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } }, - "text/plain": [ - "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 12, Finished, Available, Finished)" + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-25T19:27:25.4936309Z", + "execution_start_time": "2024-10-25T19:27:20.5334249Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "bf665ff4-43d5-4b02-90a6-6c28640576c3", + "queued_time": "2024-10-25T19:27:20.1106109Z", + "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 15, + "statement_ids": [ + 15 + ] + }, + "text/plain": [ + "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Read data from the Bronze layer\n", + "bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n", + "# Perform transformations (if any)\n", + "silver_df = bronze_df # Assuming no transformations for simplicity\n", + "# Write data to the Silver layer\n", + "silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")" ] - }, - "metadata": {}, - "output_type": "display_data" } - ], - "source": [ - "# Save the cleaned data to the \"cleansed_Silver\" table in the Silver lakehouse:\n", - "df_cleaned.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")" - ] - }, - { - "cell_type": "markdown", - "id": "f830afb3-2b02-4076-800a-85ca9fc33fea", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" + ], + "metadata": { + "dependencies": { + "lakehouse": { + "default_lakehouse": "251cd515-16a3-4555-a3d2-dfd12adb2335", + "default_lakehouse_name": "raw_Bronze", + "default_lakehouse_workspace_id": "597e0afc-c8db-4f4d-8464-d13570f5b075" + } }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Working with products information" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "7f72ac98-4ece-4a8a-a5c5-5e1fc7273382", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" }, "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } }, "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-25T19:27:25.4936309Z", - "execution_start_time": "2024-10-25T19:27:20.5334249Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "bf665ff4-43d5-4b02-90a6-6c28640576c3", - "queued_time": "2024-10-25T19:27:20.1106109Z", - "session_id": "ecb846c9-e988-4906-95bc-af67b3aacd94", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 15, - "statement_ids": [ - 15 - ] - }, - "text/plain": [ - "StatementMeta(, ecb846c9-e988-4906-95bc-af67b3aacd94, 15, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Read data from the Bronze layer\n", - "bronze_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/raw_Bronze.Lakehouse/Tables/products\")\n", - "# Perform transformations (if any)\n", - "silver_df = bronze_df # Assuming no transformations for simplicity\n", - "# Write data to the Silver layer\n", - "silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")" - ] - } - ], - "metadata": { - "dependencies": { - "lakehouse": { - "default_lakehouse": "251cd515-16a3-4555-a3d2-dfd12adb2335", - "default_lakehouse_name": "raw_Bronze", - "default_lakehouse_workspace_id": "597e0afc-c8db-4f4d-8464-d13570f5b075" - } - }, - "kernel_info": { - "name": "synapse_pyspark" - }, - "kernelspec": { - "display_name": "Synapse PySpark", - "language": "Python", - "name": "synapse_pyspark" - }, - "language_info": { - "name": "python" - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark", - "ms_spell_check": { - "ms_spell_check_language": "en" - } - }, - "nteract": { - "version": "nteract-front-end@1.0.0" - }, - "spark_compute": { - "compute_id": "/trident/default", - "session_options": { - "conf": { - "spark.synapse.nbs.session.timeout": "1200000" + "version": "nteract-front-end@1.0.0" + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } } - } }, - "widgets": {} - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb b/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb index 42bf848..d574aae 100644 --- a/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb +++ b/AzurePortal/1_MedallionArch/src/1_notebook_silver_to_gold.ipynb @@ -1 +1,487 @@ -{"cells":[{"cell_type":"markdown","source":["# PySpark Code to Move Data from Silver to Gold"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13"},{"cell_type":"markdown","source":["This is an example on how to work with the medallion architecture. From Silver to Gold"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"44f47922-4e3b-45cc-81a6-c5de97634f73"},{"cell_type":"markdown","source":["## Working with 2020orders_silver information"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"4e1afae8-b2ef-4e4c-9ac9-485139f19e9c"},{"cell_type":"code","source":["from pyspark.sql.types import *\n","import pyspark.sql.functions \n","from pyspark.sql import *\n","from pyspark.sql.functions import sum\n"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":15,"statement_ids":[15],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:16.4970957Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:16.9788868Z","execution_finish_time":"2024-10-25T20:29:17.2577679Z","parent_msg_id":"f6330200-43db-4e0b-9e85-773c1fa95042"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 15, Finished, Available, Finished)"},"metadata":{}}],"execution_count":13,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"febb6c3e-6841-42c1-a633-0da056b7f69c"},{"cell_type":"code","source":["# Read the data from the silver layer:\n","df_cleansed_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")\n","\n","df_cleansed_2020orders.head(2)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":16,"statement_ids":[16],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:17.7525504Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:18.2199944Z","execution_finish_time":"2024-10-25T20:29:26.3332937Z","parent_msg_id":"901386c6-fa37-4dad-8a8d-34e1276108f8"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 16, Finished, Available, Finished)"},"metadata":{}},{"output_type":"execute_result","execution_count":47,"data":{"text/plain":"[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]"},"metadata":{}}],"execution_count":14,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"6357dc47-1bb1-4391-8f38-b5d5a2abf5b2"},{"cell_type":"code","source":["df_cleansed_2020orders = df_cleansed_2020orders.withColumn(\"tax\", df_cleansed_2020orders[\"tax\"].cast(\"int\")) # type to int"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":17,"statement_ids":[17],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.1174684Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:26.7221718Z","execution_finish_time":"2024-10-25T20:29:26.9553583Z","parent_msg_id":"1d52d464-b1ce-40ca-a8b4-ed446b5980fd"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 17, Finished, Available, Finished)"},"metadata":{}}],"execution_count":15,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"b1478c38-d56e-4a58-a551-405675f4110d"},{"cell_type":"code","source":["df_cleansed_2020orders.printSchema()"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":18,"statement_ids":[18],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.3209761Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:27.3402865Z","execution_finish_time":"2024-10-25T20:29:27.5846334Z","parent_msg_id":"882470dc-dab6-4bef-bf92-1f25c81c4bad"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 18, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["root\n |-- ID: string (nullable = true)\n |-- Count: integer (nullable = true)\n |-- Date: string (nullable = true)\n |-- Name: string (nullable = true)\n |-- Style: string (nullable = true)\n |-- price: double (nullable = true)\n |-- tax: integer (nullable = true)\n\n"]}],"execution_count":16,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"f9da346b-2b25-463a-821d-6b8d7bda321e"},{"cell_type":"code","source":["# Group and Aggregate the Data:\n","df_aggregated = df_cleansed_2020orders.groupBy(\"Style\").agg(sum(\"price\").alias(\"total_price_vehicles\"))\n","df_aggregated.show(10, truncate=False)\n","print(df_aggregated)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":19,"statement_ids":[19],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.5521281Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:27.9679539Z","execution_finish_time":"2024-10-25T20:29:34.3035094Z","parent_msg_id":"9a5b30d2-5552-495b-b0a7-c5cfed0bebd8"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 19, Finished, Available, Finished)"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["+-----------------------+--------------------+\n|Style |total_price_vehicles|\n+-----------------------+--------------------+\n|Mountain-200 Black, 42 |196713.42720000003 |\n|Mountain-100 Silver, 42|71399.78999999998 |\n|Mountain-200 Silver, 42|159499.30919999976 |\n|Mountain-100 Silver, 44|71399.78999999998 |\n|Road-550-W Yellow, 40 |37016.1875 |\n|Mountain-100 Silver, 38|64599.80999999997 |\n|Road-250 Red, 48 |256551.75000000044 |\n|Road-250 Red, 52 |217458.15000000034 |\n|Road-650 Red, 52 |20301.81200000001 |\n|Road-250 Black, 52 |253061.25 |\n+-----------------------+--------------------+\nonly showing top 10 rows\n\nDataFrame[Style: string, total_price_vehicles: double]\n"]}],"execution_count":17,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"5f5c8125-cbf5-4e00-9d8f-0c437f25b37f"},{"cell_type":"code","source":["# Save the cleaned data to the \"curated_Gold\" table in the Gold lakehouse:\n","df_aggregated.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_Gold.Lakehouse/Tables/2020orders_gold\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":20,"statement_ids":[20],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:18.7090964Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:34.7050563Z","execution_finish_time":"2024-10-25T20:29:39.5141044Z","parent_msg_id":"56e18a6c-9106-4200-a6cc-7386c03b93d3"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 20, Finished, Available, Finished)"},"metadata":{}}],"execution_count":18,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"c83d4e46-2b49-490f-aadb-87a350c85e89"},{"cell_type":"markdown","source":["## Working with products_silver information"],"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"f830afb3-2b02-4076-800a-85ca9fc33fea"},{"cell_type":"code","source":["# Read data from the Silver layer\n","silver_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")\n","# Perform transformations (if any)\n","silver_df = silver_df # Assuming no transformations for simplicity\n","# Write data to the Gold layer\n","silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_gold.Lakehouse/Tables/products_silver\")"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":21,"statement_ids":[21],"state":"finished","livy_statement_state":"available","session_id":"8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5","normalized_state":"finished","queued_time":"2024-10-25T20:29:45.1788369Z","session_start_time":null,"execution_start_time":"2024-10-25T20:29:45.5790407Z","execution_finish_time":"2024-10-25T20:29:49.090114Z","parent_msg_id":"67830cc7-dbf9-4b47-a9ea-51ce1c29634b"},"text/plain":"StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 21, Finished, Available, Finished)"},"metadata":{}}],"execution_count":19,"metadata":{"jupyter":{"outputs_hidden":false,"source_hidden":false},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"nteract":{"transient":{"deleting":false}}},"id":"7f72ac98-4ece-4a8a-a5c5-5e1fc7273382"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"nteract-front-end@1.0.0"},"widgets":{},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{"lakehouse":{"default_lakehouse":"251cd515-16a3-4555-a3d2-dfd12adb2335","default_lakehouse_name":"raw_Bronze","default_lakehouse_workspace_id":"597e0afc-c8db-4f4d-8464-d13570f5b075"}}},"nbformat":4,"nbformat_minor":5} +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# PySpark Code to Move Data from Silver to Gold" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "ab7f0b7c-b0cc-44ec-9948-61d68f4b0b13" + }, + { + "cell_type": "markdown", + "source": [ + "This is an example on how to work with the medallion architecture. From Silver to Gold" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "44f47922-4e3b-45cc-81a6-c5de97634f73" + }, + { + "cell_type": "markdown", + "source": [ + "## Working with 2020orders_silver information" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "4e1afae8-b2ef-4e4c-9ac9-485139f19e9c" + }, + { + "cell_type": "code", + "source": [ + "from pyspark.sql.types import *\n", + "import pyspark.sql.functions \n", + "from pyspark.sql import *\n", + "from pyspark.sql.functions import sum\n" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 15, + "statement_ids": [ + 15 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:16.4970957Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:16.9788868Z", + "execution_finish_time": "2024-10-25T20:29:17.2577679Z", + "parent_msg_id": "f6330200-43db-4e0b-9e85-773c1fa95042" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 15, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 13, + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "id": "febb6c3e-6841-42c1-a633-0da056b7f69c" + }, + { + "cell_type": "code", + "source": [ + "# Read the data from the silver layer:\n", + "df_cleansed_2020orders = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/2020orders_silver\")\n", + "\n", + "df_cleansed_2020orders.head(2)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 16, + "statement_ids": [ + 16 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:17.7525504Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:18.2199944Z", + "execution_finish_time": "2024-10-25T20:29:26.3332937Z", + "parent_msg_id": "901386c6-fa37-4dad-8a8d-34e1276108f8" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 16, Finished, Available, Finished)" + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "execution_count": 47, + "data": { + "text/plain": "[Row(ID='SO45376', Count=1, Date='1/5/2020', Name='Edgar Mehta', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992),\n Row(ID='SO45381', Count=1, Date='1/6/2020', Name='Jordan Long', Style='Mountain-100 Silver, 38', price=3399.99, tax=271.9992)]" + }, + "metadata": {} + } + ], + "execution_count": 14, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "6357dc47-1bb1-4391-8f38-b5d5a2abf5b2" + }, + { + "cell_type": "code", + "source": [ + "df_cleansed_2020orders = df_cleansed_2020orders.withColumn(\"tax\", df_cleansed_2020orders[\"tax\"].cast(\"int\")) # type to int" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 17, + "statement_ids": [ + 17 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.1174684Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:26.7221718Z", + "execution_finish_time": "2024-10-25T20:29:26.9553583Z", + "parent_msg_id": "1d52d464-b1ce-40ca-a8b4-ed446b5980fd" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 17, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 15, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "id": "b1478c38-d56e-4a58-a551-405675f4110d" + }, + { + "cell_type": "code", + "source": [ + "df_cleansed_2020orders.printSchema()" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 18, + "statement_ids": [ + 18 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.3209761Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:27.3402865Z", + "execution_finish_time": "2024-10-25T20:29:27.5846334Z", + "parent_msg_id": "882470dc-dab6-4bef-bf92-1f25c81c4bad" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 18, Finished, Available, Finished)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "root\n |-- ID: string (nullable = true)\n |-- Count: integer (nullable = true)\n |-- Date: string (nullable = true)\n |-- Name: string (nullable = true)\n |-- Style: string (nullable = true)\n |-- price: double (nullable = true)\n |-- tax: integer (nullable = true)\n\n" + ] + } + ], + "execution_count": 16, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "id": "f9da346b-2b25-463a-821d-6b8d7bda321e" + }, + { + "cell_type": "code", + "source": [ + "# Group and Aggregate the Data:\n", + "df_aggregated = df_cleansed_2020orders.groupBy(\"Style\").agg(sum(\"price\").alias(\"total_price_vehicles\"))\n", + "df_aggregated.show(10, truncate=False)\n", + "print(df_aggregated)" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 19, + "statement_ids": [ + 19 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.5521281Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:27.9679539Z", + "execution_finish_time": "2024-10-25T20:29:34.3035094Z", + "parent_msg_id": "9a5b30d2-5552-495b-b0a7-c5cfed0bebd8" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 19, Finished, Available, Finished)" + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+-----------------------+--------------------+\n|Style |total_price_vehicles|\n+-----------------------+--------------------+\n|Mountain-200 Black, 42 |196713.42720000003 |\n|Mountain-100 Silver, 42|71399.78999999998 |\n|Mountain-200 Silver, 42|159499.30919999976 |\n|Mountain-100 Silver, 44|71399.78999999998 |\n|Road-550-W Yellow, 40 |37016.1875 |\n|Mountain-100 Silver, 38|64599.80999999997 |\n|Road-250 Red, 48 |256551.75000000044 |\n|Road-250 Red, 52 |217458.15000000034 |\n|Road-650 Red, 52 |20301.81200000001 |\n|Road-250 Black, 52 |253061.25 |\n+-----------------------+--------------------+\nonly showing top 10 rows\n\nDataFrame[Style: string, total_price_vehicles: double]\n" + ] + } + ], + "execution_count": 17, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "5f5c8125-cbf5-4e00-9d8f-0c437f25b37f" + }, + { + "cell_type": "code", + "source": [ + "# Save the cleaned data to the \"curated_Gold\" table in the Gold lakehouse:\n", + "df_aggregated.write.format(\"delta\").mode(\"overwrite\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_Gold.Lakehouse/Tables/2020orders_gold\")" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 20, + "statement_ids": [ + 20 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:18.7090964Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:34.7050563Z", + "execution_finish_time": "2024-10-25T20:29:39.5141044Z", + "parent_msg_id": "56e18a6c-9106-4200-a6cc-7386c03b93d3" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 20, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 18, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "c83d4e46-2b49-490f-aadb-87a350c85e89" + }, + { + "cell_type": "markdown", + "source": [ + "## Working with products_silver information" + ], + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "f830afb3-2b02-4076-800a-85ca9fc33fea" + }, + { + "cell_type": "code", + "source": [ + "# Read data from the Silver layer\n", + "silver_df = spark.read.format(\"delta\").load(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/cleansed_test_Silver.Lakehouse/Tables/products_silver\")\n", + "# Perform transformations (if any)\n", + "silver_df = silver_df # Assuming no transformations for simplicity\n", + "# Write data to the Gold layer\n", + "silver_df.write.mode(\"overwrite\").format(\"delta\").save(\"abfss://fabric_medallion_arch_demo@onelake.dfs.fabric.microsoft.com/curated_gold.Lakehouse/Tables/products_silver\")" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.livy.statement-meta+json": { + "spark_pool": null, + "statement_id": 21, + "statement_ids": [ + 21 + ], + "state": "finished", + "livy_statement_state": "available", + "session_id": "8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5", + "normalized_state": "finished", + "queued_time": "2024-10-25T20:29:45.1788369Z", + "session_start_time": null, + "execution_start_time": "2024-10-25T20:29:45.5790407Z", + "execution_finish_time": "2024-10-25T20:29:49.090114Z", + "parent_msg_id": "67830cc7-dbf9-4b47-a9ea-51ce1c29634b" + }, + "text/plain": "StatementMeta(, 8c75f6dc-9a98-446c-957a-b0fa1e9d3fa5, 21, Finished, Available, Finished)" + }, + "metadata": {} + } + ], + "execution_count": 19, + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "id": "7f72ac98-4ece-4a8a-a5c5-5e1fc7273382" + } + ], + "metadata": { + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "name": "synapse_pyspark", + "language": "Python", + "display_name": "Synapse PySpark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "dependencies": { + "lakehouse": { + "default_lakehouse": "251cd515-16a3-4555-a3d2-dfd12adb2335", + "default_lakehouse_name": "raw_Bronze", + "default_lakehouse_workspace_id": "597e0afc-c8db-4f4d-8464-d13570f5b075" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb b/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb index 880187e..a0d1b8e 100644 --- a/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb +++ b/AzurePortal/2_AI_LLMs/src/fabric-llms-overview_sample.ipynb @@ -1,1202 +1,1203 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "519955e9-2dad-456d-93db-a332d38e9433", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "# Fabric: Highlights into AI/LLMs" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d312e8d9-03fe-4b3d-aa6d-c52e3022ae39", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T03:58:26.7170509Z", - "execution_start_time": "2024-10-31T03:58:19.270951Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "e267b6ab-5133-4598-8251-d64374cd11e5", - "queued_time": "2024-10-31T03:58:18.9132075Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 5, - "statement_ids": [ - 5 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 5, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: synapseml\r\n", - "Version: 1.0.8\r\n", - "Summary: Synapse Machine Learning\r\n", - "Home-page: https://github.com/Microsoft/SynapseML\r\n", - "Author: Microsoft\r\n", - "Author-email: synapseml-support@microsoft.com\r\n", - "License: MIT\r\n", - "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", - "Requires: \r\n", - "Required-by: \r\n" - ] - } - ], - "source": [ - "!pip show synapseml" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "427610d0-3fae-45e3-8150-92ee7674f44c", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T03:58:28.6254349Z", - "execution_start_time": "2024-10-31T03:58:27.1124616Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "0e9f6c0f-062b-4e5d-9061-afcd89c8fd75", - "queued_time": "2024-10-31T03:58:19.3223486Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 6, - "statement_ids": [ - 6 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 6, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: langchain-openai\r\n", - "Version: 0.2.4\r\n", - "Summary: An integration package connecting OpenAI and LangChain\r\n", - "Home-page: https://github.com/langchain-ai/langchain\r\n", - "Author: \r\n", - "Author-email: \r\n", - "License: MIT\r\n", - "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", - "Requires: langchain-core, openai, tiktoken\r\n", - "Required-by: \r\n" - ] - } - ], - "source": [ - "!pip show langchain-openai" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "baeeb853-2104-4edf-abf4-4d4be50cb977", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T03:58:30.5465258Z", - "execution_start_time": "2024-10-31T03:58:29.0000586Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "716d9975-263b-4d92-b25c-b342106f5f43", - "queued_time": "2024-10-31T03:58:19.511824Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 7, - "statement_ids": [ - 7 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 7, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: langchain\r\n", - "Version: 0.3.6\r\n", - "Summary: Building applications with LLMs through composability\r\n", - "Home-page: https://github.com/langchain-ai/langchain\r\n", - "Author: \r\n", - "Author-email: \r\n", - "License: MIT\r\n", - "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", - "Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity\r\n", - "Required-by: langchain-community\r\n" - ] - } - ], - "source": [ - "!pip show langchain" - ] - }, - { - "cell_type": "markdown", - "id": "c58cc406-c4f5-4607-a740-0802e8e4b550", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Ensure you have the API key and endpoint URL for your deployed model. Set these as environment variables" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "3c8ada7c-2632-4c69-86d2-f5260ee8f1b7", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:14.3495341Z", - "execution_start_time": "2024-10-31T04:20:14.1128215Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "2573bf75-fe6d-40dc-b9f6-e06ebb9f7f73", - "queued_time": "2024-10-31T04:20:13.6194485Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 22, - "statement_ids": [ - 22 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 22, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_VERSION\"] = \"2023-08-01-preview\"\n", - "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"https://your-resource.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview\"\n", - "os.environ[\"AZURE_OPENAI_API_KEY\"] = \"your-value\"" - ] - }, - { - "cell_type": "markdown", - "id": "3fac48a9-45fb-4e86-9792-8ee340b0ac60", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Create an instance of the Azure OpenAI class using the environment variables set above" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "5db10350-8000-4cbd-9bdf-d7da62d7fe61", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:14.9382032Z", - "execution_start_time": "2024-10-31T04:20:14.7083469Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "7dfaca5a-f738-4010-bba1-f764ea70f450", - "queued_time": "2024-10-31T04:20:14.027325Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 23, - "statement_ids": [ - 23 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 23, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain_openai import AzureChatOpenAI\n", - "\n", - "# Set the API base URL\n", - "api_base = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n", - "\n", - "# Create an instance of the Azure OpenAI Class\n", - "llm = AzureChatOpenAI(\n", - " openai_api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", - " temperature=0.7,\n", - " verbose=True,\n", - " top_p=0.9\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b17d7450-34b5-4ece-8e20-a77ddcdd93c4", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Use the Azure OpenAI service to generate text or perform other language model tasks" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "cfc5fd62-085a-4eff-9192-696d9f249a8e", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:16.0500538Z", - "execution_start_time": "2024-10-31T04:20:15.2936074Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "e14e4d0b-1fd0-4dac-a07d-6479d6536ce3", - "queued_time": "2024-10-31T04:20:14.4969185Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 24, - "statement_ids": [ - 24 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 24, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "content='Salut, comment ça va ?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 33, 'total_tokens': 39, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_d54531d9eb', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}} id='run-8cb7f29a-44c1-4f65-a648-15afb2d793dc-0' usage_metadata={'input_tokens': 33, 'output_tokens': 6, 'total_tokens': 39, 'input_token_details': {}, 'output_token_details': {}}\n" - ] - } - ], - "source": [ - "# Define a prompt\n", - "messages = [\n", - " (\n", - " \"system\",\n", - " \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n", - " ),\n", - " (\"human\", \"Hi, how are you?\"),\n", - "]\n", - "\n", - "# Generate a response from the Azure OpenAI service using the invoke method\n", - "ai_msg = llm.invoke(messages)\n", - "\n", - "# Print the response\n", - "print(ai_msg)" - ] - }, - { - "cell_type": "markdown", - "id": "79729106-c7f1-4879-bc2b-871b50c2ac9a", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Define a prompt template for generating definitions" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "ca633361-c27b-4294-b8a7-9fc4a316afa4", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:16.587491Z", - "execution_start_time": "2024-10-31T04:20:16.3655978Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "cc3215f4-71a5-4231-af47-9bd9a8f5698a", - "queued_time": "2024-10-31T04:20:14.7799392Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 25, - "statement_ids": [ - 25 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 25, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.prompts import PromptTemplate\n", - "\n", - "copy_prompt = PromptTemplate(\n", - " input_variables=[\"technology\"],\n", - " template=\"Define the following word: {technology}\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "899839d9-adca-4042-b662-73edcad7e432", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Create an LLMChain with the defined prompt template" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "bd4f65ca-049b-481d-bbbd-a017c6c0119b", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:17.1233668Z", - "execution_start_time": "2024-10-31T04:20:16.9052959Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "27790d83-509f-4716-bb69-9c288ad069ba", - "queued_time": "2024-10-31T04:20:15.1325692Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 26, - "statement_ids": [ - 26 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 26, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.chains import LLMChain\n", - "\n", - "chain = LLMChain(llm=llm, prompt=copy_prompt)\n" - ] - }, - { - "cell_type": "markdown", - "id": "936b3ddf-cc65-436c-ba4e-ae0abe21fc2c", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Set up the LangChain transformer to execute the processing chain\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "63a00038-37b4-49ee-9c53-128c8acf9d01", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:20:18.181457Z", - "execution_start_time": "2024-10-31T04:20:17.4351576Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "3fb30420-f0c9-477b-ad1a-001dc0d8d37a", - "queued_time": "2024-10-31T04:20:15.6799013Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 27, - "statement_ids": [ - 27 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 27, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from synapse.ml.cognitive.langchain import LangchainTransformer\n", - "\n", - "openai_api_key= os.environ[\"AZURE_OPENAI_API_KEY\"]\n", - "\n", - "transformer = (\n", - " LangchainTransformer()\n", - " .setInputCol(\"technology\")\n", - " .setOutputCol(\"definition\")\n", - " .setChain(chain)\n", - " .setSubscriptionKey(openai_api_key)\n", - " .setUrl(api_base)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "c74293f0-925e-4987-a6a1-b3b9b8e14b9d", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Construct a DataFrame with technology names." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "8e03963e-2fcf-4934-b96f-ac27b4e0353c", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:24:08.3891172Z", - "execution_start_time": "2024-10-31T04:24:02.0675933Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "856f5b73-26e8-4d20-a901-356cd92b9c2a", - "queued_time": "2024-10-31T04:24:01.6603792Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 29, - "statement_ids": [ - 29 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 29, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----+----------+----------------------+\n", - "|label|technology|transformed_technology|\n", - "+-----+----------+----------------------+\n", - "| 0| docker| DOCKER|\n", - "| 1| spark| SPARK|\n", - "| 2| python| PYTHON|\n", - "+-----+----------+----------------------+\n", - "\n" - ] - } - ], - "source": [ - "from pyspark.sql import SparkSession\n", - "from pyspark.sql.functions import udf\n", - "from pyspark.sql.types import StringType\n", - "\n", - "# Initialize Spark session\n", - "spark = SparkSession.builder.appName(\"example\").getOrCreate()\n", - "\n", - "# Construct a DataFrame with technology names\n", - "df = spark.createDataFrame(\n", - " [\n", - " (0, \"docker\"), (1, \"spark\"), (2, \"python\")\n", - " ],\n", - " [\"label\", \"technology\"]\n", - ")\n", - "\n", - "# Define a simple UDF to transform the technology column\n", - "def transform_technology(tech):\n", - " return tech.upper()\n", - "\n", - "# Register the UDF\n", - "transform_udf = udf(transform_technology, StringType())\n", - "\n", - "# Apply the UDF to the DataFrame\n", - "transformed_df = df.withColumn(\"transformed_technology\", transform_udf(df[\"technology\"]))\n", - "\n", - "# Show the transformed DataFrame\n", - "transformed_df.show()" - ] - }, - { - "cell_type": "markdown", - "id": "47ab1ba6-deaf-488d-9e95-8202669d948c", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Extract content from PDFs linked in arXiv papers and generate prompts for extracting specific information.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "8b52c87e-5971-4d28-bc4b-4160d29a1c24", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:27:08.3224773Z", - "execution_start_time": "2024-10-31T04:27:08.0430507Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "4eeab690-4159-41dc-be69-3cceed484314", - "queued_time": "2024-10-31T04:27:07.6309068Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 30, - "statement_ids": [ - 30 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 30, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.document_loaders import OnlinePDFLoader\n", - "\n", - "def paper_content_extraction(inputs: dict) -> dict:\n", - " arxiv_link = inputs[\"arxiv_link\"]\n", - " loader = OnlinePDFLoader(arxiv_link)\n", - " pages = loader.load_and_split()\n", - " return {\"paper_content\": pages[0].page_content + pages[1].page_content}\n", - "\n", - "def prompt_generation(inputs: dict) -> dict:\n", - " output = inputs[\"Output\"]\n", - " prompt = (\n", - " \"find the paper title, author, summary in the paper description below, output them. \"\n", - " \"After that, Use websearch to find out 3 recent papers of the first author in the author section below \"\n", - " \"(first author is the first name separated by comma) and list the paper titles in bullet points: \"\n", - " \"\\n\" + output + \".\"\n", - " )\n", - " return {\"prompt\": prompt}" - ] - }, - { - "cell_type": "markdown", - "id": "89d79c38-ba0c-4062-911c-7ede02536298", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Set up a chain to extract structured information from an arXiv link\n" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "e85241a0-11c2-49c1-9b2e-63187cb24d9a", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:28:11.2331925Z", - "execution_start_time": "2024-10-31T04:28:11.0134852Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "232b4aa0-1b84-47f8-bb5d-347a575d9640", - "queued_time": "2024-10-31T04:28:10.663514Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 31, - "statement_ids": [ - 31 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 31, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from langchain.chains import TransformChain, SimpleSequentialChain\n", - "\n", - "paper_content_extraction_chain = TransformChain(\n", - " input_variables=[\"arxiv_link\"],\n", - " output_variables=[\"paper_content\"],\n", - " transform=paper_content_extraction,\n", - " verbose=False,\n", - ")\n", - "\n", - "paper_summarizer_template = \"\"\"\n", - "You are a paper summarizer, given the paper content, it is your job to summarize the paper into a short summary, \n", - "and extract authors and paper title from the paper content.\n", - "\"\"\"" - ] - }, - { - "cell_type": "markdown", - "id": "64937339-791c-4aad-953b-ca990bfd324a", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Use Microsoft Fabric's native integration with the MLflow framework to log the trained machine learning models, the used hyperparameters, and evaluation metrics." - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5bac7684-a123-4733-baa3-a748ff0fd070", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [ - { - "data": { - "application/vnd.livy.statement-meta+json": { - "execution_finish_time": "2024-10-31T04:36:54.8917645Z", - "execution_start_time": "2024-10-31T04:36:44.7561664Z", - "livy_statement_state": "available", - "normalized_state": "finished", - "parent_msg_id": "d2abef17-25d7-41c4-a62f-051d9b5fe8d7", - "queued_time": "2024-10-31T04:36:44.2999954Z", - "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "session_start_time": null, - "spark_pool": null, - "state": "finished", - "statement_id": 33, - "statement_ids": [ - 33 - ] - }, - "text/plain": [ - "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 33, Finished, Available, Finished)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Registered model 'RandomForestRegressionModel' already exists. Creating a new version of this model...\n", - "2024/10/31 04:36:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestRegressionModel, version 2\n", - "Created version '2' of model 'RandomForestRegressionModel'.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model Name: RandomForestRegressionModel\n", - "Model Version: 2\n" - ] - }, - { - "data": { - "application/vnd.mlflow.run-widget+json": { - "data": { - "metrics": {}, - "params": { - "n_estimators": "3", - "random_state": "42" - }, - "tags": { - "mlflow.rootRunId": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", - "mlflow.runName": "icy_hamster_xr34qfzf", - "mlflow.user": "4b3a56ea-6f42-450e-b7c3-fb2932c7ac32", - "synapseml.experiment.artifactId": "17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b", - "synapseml.experimentName": "Notebook-1", - "synapseml.livy.id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", - "synapseml.notebook.artifactId": "789d5fef-b2a1-409b-996f-0cdb4e748a90", - "synapseml.user.id": "ea5a1fdc-a08c-493a-bce9-8422f28ecd05", - "synapseml.user.name": "System Administrator" - } - }, - "info": { - "artifact_uri": "sds://onelakewestus3.pbidedicated.windows.net/6361aeaa-b63a-44ea-b28f-26db10b31a6c/17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b/20c75f63-d266-40b1-83f7-d9c76fd1f4f4/artifacts", - "end_time": 1730349412, - "experiment_id": "d52403ad-a9c2-41ba-b582-9b8e9a57917e", - "lifecycle_stage": "active", - "run_id": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", - "run_name": "", - "run_uuid": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", - "start_time": 1730349405, - "status": "FINISHED", - "user_id": "7ebfac85-3ebb-440f-a743-e52052051f6a" - }, - "inputs": { - "dataset_inputs": [] - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import mlflow\n", - "from mlflow.models import infer_signature\n", - "from sklearn.datasets import make_regression\n", - "from sklearn.ensemble import RandomForestRegressor\n", - "\n", - "# Generate synthetic regression data\n", - "X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)\n", - "\n", - "# Model parameters\n", - "params = {\"n_estimators\": 3, \"random_state\": 42}\n", - "\n", - "# Model tags for MLflow\n", - "model_tags = {\n", - " \"project_name\": \"grocery-forecasting\",\n", - " \"store_dept\": \"produce\",\n", - " \"team\": \"stores-ml\",\n", - " \"project_quarter\": \"Q3-2023\"\n", - "}\n", - "\n", - "# Log MLflow entities\n", - "with mlflow.start_run() as run:\n", - " # Train the model\n", - " model = RandomForestRegressor(**params).fit(X, y)\n", - "\n", - " # Infer the model signature\n", - " signature = infer_signature(X, model.predict(X))\n", - "\n", - " # Log parameters and the model\n", - " mlflow.log_params(params)\n", - " mlflow.sklearn.log_model(model, artifact_path=\"sklearn-model\", signature=signature)\n", - "\n", - " # Register the model with tags\n", - " model_uri = f\"runs:/{run.info.run_id}/sklearn-model\"\n", - " model_version = mlflow.register_model(model_uri, \"RandomForestRegressionModel\", tags=model_tags)\n", - "\n", - " # Output model registration details\n", - " print(f\"Model Name: {model_version.name}\")\n", - " print(f\"Model Version: {model_version.version}\")" - ] - }, - { - "cell_type": "markdown", - "id": "315ebdcd-e78c-4bc5-93d6-f202d02bddc5", - "metadata": { - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "source": [ - "## Use MLflow to search among multiple models saved within the workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60e6f7d3-d1ec-4ccc-9745-6c7938d2f4bc", - "metadata": { - "jupyter": { - "outputs_hidden": false, - "source_hidden": false - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark" - }, - "nteract": { - "transient": { - "deleting": false - } - } - }, - "outputs": [], - "source": [ - "from pprint import pprint\n", - "from mlflow.tracking import MlflowClient\n", - "\n", - "client = MlflowClient()\n", - "for rm in client.search_registered_models():\n", - " pprint(dict(rm), indent=4)" - ] - } - ], - "metadata": { - "application/vnd.jupyter.widget-state+json": { - "version": "1.0" - }, - "dependencies": { - "environment": { - "environmentId": "766562be-9e21-456c-b270-cac7e4bf8d18", - "workspaceId": "6361aeaa-b63a-44ea-b28f-26db10b31a6c" - } - }, - "kernel_info": { - "name": "synapse_pyspark" - }, - "kernelspec": { - "display_name": "Synapse PySpark", - "language": "Python", - "name": "synapse_pyspark" - }, - "language_info": { - "name": "python" - }, - "microsoft": { - "language": "python", - "language_group": "synapse_pyspark", - "ms_spell_check": { - "ms_spell_check_language": "en" - } - }, - "nteract": { - "version": "nteract-front-end@1.0.0" - }, - "spark_compute": { - "compute_id": "/trident/default", - "session_options": { - "conf": { - "spark.synapse.nbs.session.timeout": "1200000" - } - } - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version": "1.0" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} +{ + "cells": [ + { + "cell_type": "markdown", + "id": "519955e9-2dad-456d-93db-a332d38e9433", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "# Fabric: Highlights into AI/LLMs" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d312e8d9-03fe-4b3d-aa6d-c52e3022ae39", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T03:58:26.7170509Z", + "execution_start_time": "2024-10-31T03:58:19.270951Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "e267b6ab-5133-4598-8251-d64374cd11e5", + "queued_time": "2024-10-31T03:58:18.9132075Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 5, + "statement_ids": [ + 5 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 5, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: synapseml\r\n", + "Version: 1.0.8\r\n", + "Summary: Synapse Machine Learning\r\n", + "Home-page: https://github.com/Microsoft/SynapseML\r\n", + "Author: Microsoft\r\n", + "Author-email: synapseml-support@microsoft.com\r\n", + "License: MIT\r\n", + "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", + "Requires: \r\n", + "Required-by: \r\n" + ] + } + ], + "source": [ + "!pip show synapseml" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "427610d0-3fae-45e3-8150-92ee7674f44c", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T03:58:28.6254349Z", + "execution_start_time": "2024-10-31T03:58:27.1124616Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "0e9f6c0f-062b-4e5d-9061-afcd89c8fd75", + "queued_time": "2024-10-31T03:58:19.3223486Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 6, + "statement_ids": [ + 6 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 6, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: langchain-openai\r\n", + "Version: 0.2.4\r\n", + "Summary: An integration package connecting OpenAI and LangChain\r\n", + "Home-page: https://github.com/langchain-ai/langchain\r\n", + "Author: \r\n", + "Author-email: \r\n", + "License: MIT\r\n", + "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", + "Requires: langchain-core, openai, tiktoken\r\n", + "Required-by: \r\n" + ] + } + ], + "source": [ + "!pip show langchain-openai" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "baeeb853-2104-4edf-abf4-4d4be50cb977", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T03:58:30.5465258Z", + "execution_start_time": "2024-10-31T03:58:29.0000586Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "716d9975-263b-4d92-b25c-b342106f5f43", + "queued_time": "2024-10-31T03:58:19.511824Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 7, + "statement_ids": [ + 7 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 7, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: langchain\r\n", + "Version: 0.3.6\r\n", + "Summary: Building applications with LLMs through composability\r\n", + "Home-page: https://github.com/langchain-ai/langchain\r\n", + "Author: \r\n", + "Author-email: \r\n", + "License: MIT\r\n", + "Location: /home/trusted-service-user/cluster-env/clonedenv/lib/python3.11/site-packages\r\n", + "Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity\r\n", + "Required-by: langchain-community\r\n" + ] + } + ], + "source": [ + "!pip show langchain" + ] + }, + { + "cell_type": "markdown", + "id": "c58cc406-c4f5-4607-a740-0802e8e4b550", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Ensure you have the API key and endpoint URL for your deployed model. Set these as environment variables" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3c8ada7c-2632-4c69-86d2-f5260ee8f1b7", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:14.3495341Z", + "execution_start_time": "2024-10-31T04:20:14.1128215Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "2573bf75-fe6d-40dc-b9f6-e06ebb9f7f73", + "queued_time": "2024-10-31T04:20:13.6194485Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 22, + "statement_ids": [ + 22 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 22, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_VERSION\"] = \"2023-08-01-preview\"\n", + "os.environ[\"AZURE_OPENAI_ENDPOINT\"] = \"https://your-resource.openai.azure.com/openai/deployments/gpt-4o-mini/chat/completions?api-version=2024-08-01-preview\"\n", + "os.environ[\"AZURE_OPENAI_API_KEY\"] = \"your-value\"" + ] + }, + { + "cell_type": "markdown", + "id": "3fac48a9-45fb-4e86-9792-8ee340b0ac60", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create an instance of the Azure OpenAI class using the environment variables set above" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5db10350-8000-4cbd-9bdf-d7da62d7fe61", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:14.9382032Z", + "execution_start_time": "2024-10-31T04:20:14.7083469Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "7dfaca5a-f738-4010-bba1-f764ea70f450", + "queued_time": "2024-10-31T04:20:14.027325Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 23, + "statement_ids": [ + 23 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 23, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain_openai import AzureChatOpenAI\n", + "\n", + "# Set the API base URL\n", + "api_base = os.environ[\"AZURE_OPENAI_ENDPOINT\"]\n", + "\n", + "# Create an instance of the Azure OpenAI Class\n", + "llm = AzureChatOpenAI(\n", + " openai_api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n", + " temperature=0.7,\n", + " verbose=True,\n", + " top_p=0.9\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b17d7450-34b5-4ece-8e20-a77ddcdd93c4", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Use the Azure OpenAI service to generate text or perform other language model tasks" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "cfc5fd62-085a-4eff-9192-696d9f249a8e", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:16.0500538Z", + "execution_start_time": "2024-10-31T04:20:15.2936074Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "e14e4d0b-1fd0-4dac-a07d-6479d6536ce3", + "queued_time": "2024-10-31T04:20:14.4969185Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 24, + "statement_ids": [ + 24 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 24, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "content='Salut, comment \u00e7a va ?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 6, 'prompt_tokens': 33, 'total_tokens': 39, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4o-mini', 'system_fingerprint': 'fp_d54531d9eb', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'protected_material_code': {'filtered': False, 'detected': False}, 'protected_material_text': {'filtered': False, 'detected': False}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}} id='run-8cb7f29a-44c1-4f65-a648-15afb2d793dc-0' usage_metadata={'input_tokens': 33, 'output_tokens': 6, 'total_tokens': 39, 'input_token_details': {}, 'output_token_details': {}}\n" + ] + } + ], + "source": [ + "# Define a prompt\n", + "messages = [\n", + " (\n", + " \"system\",\n", + " \"You are a helpful assistant that translates English to French. Translate the user sentence.\",\n", + " ),\n", + " (\"human\", \"Hi, how are you?\"),\n", + "]\n", + "\n", + "# Generate a response from the Azure OpenAI service using the invoke method\n", + "ai_msg = llm.invoke(messages)\n", + "\n", + "# Print the response\n", + "print(ai_msg)" + ] + }, + { + "cell_type": "markdown", + "id": "79729106-c7f1-4879-bc2b-871b50c2ac9a", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Define a prompt template for generating definitions" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ca633361-c27b-4294-b8a7-9fc4a316afa4", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:16.587491Z", + "execution_start_time": "2024-10-31T04:20:16.3655978Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "cc3215f4-71a5-4231-af47-9bd9a8f5698a", + "queued_time": "2024-10-31T04:20:14.7799392Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 25, + "statement_ids": [ + 25 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 25, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "\n", + "copy_prompt = PromptTemplate(\n", + " input_variables=[\"technology\"],\n", + " template=\"Define the following word: {technology}\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "899839d9-adca-4042-b662-73edcad7e432", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Create an LLMChain with the defined prompt template" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "bd4f65ca-049b-481d-bbbd-a017c6c0119b", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:17.1233668Z", + "execution_start_time": "2024-10-31T04:20:16.9052959Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "27790d83-509f-4716-bb69-9c288ad069ba", + "queued_time": "2024-10-31T04:20:15.1325692Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 26, + "statement_ids": [ + 26 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 26, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.chains import LLMChain\n", + "\n", + "chain = LLMChain(llm=llm, prompt=copy_prompt)\n" + ] + }, + { + "cell_type": "markdown", + "id": "936b3ddf-cc65-436c-ba4e-ae0abe21fc2c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Set up the LangChain transformer to execute the processing chain\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "63a00038-37b4-49ee-9c53-128c8acf9d01", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:20:18.181457Z", + "execution_start_time": "2024-10-31T04:20:17.4351576Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "3fb30420-f0c9-477b-ad1a-001dc0d8d37a", + "queued_time": "2024-10-31T04:20:15.6799013Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 27, + "statement_ids": [ + 27 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 27, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from synapse.ml.cognitive.langchain import LangchainTransformer\n", + "\n", + "openai_api_key= os.environ[\"AZURE_OPENAI_API_KEY\"]\n", + "\n", + "transformer = (\n", + " LangchainTransformer()\n", + " .setInputCol(\"technology\")\n", + " .setOutputCol(\"definition\")\n", + " .setChain(chain)\n", + " .setSubscriptionKey(openai_api_key)\n", + " .setUrl(api_base)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c74293f0-925e-4987-a6a1-b3b9b8e14b9d", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Construct a DataFrame with technology names." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8e03963e-2fcf-4934-b96f-ac27b4e0353c", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:24:08.3891172Z", + "execution_start_time": "2024-10-31T04:24:02.0675933Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "856f5b73-26e8-4d20-a901-356cd92b9c2a", + "queued_time": "2024-10-31T04:24:01.6603792Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 29, + "statement_ids": [ + 29 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 29, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+----------+----------------------+\n", + "|label|technology|transformed_technology|\n", + "+-----+----------+----------------------+\n", + "| 0| docker| DOCKER|\n", + "| 1| spark| SPARK|\n", + "| 2| python| PYTHON|\n", + "+-----+----------+----------------------+\n", + "\n" + ] + } + ], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import StringType\n", + "\n", + "# Initialize Spark session\n", + "spark = SparkSession.builder.appName(\"example\").getOrCreate()\n", + "\n", + "# Construct a DataFrame with technology names\n", + "df = spark.createDataFrame(\n", + " [\n", + " (0, \"docker\"), (1, \"spark\"), (2, \"python\")\n", + " ],\n", + " [\"label\", \"technology\"]\n", + ")\n", + "\n", + "# Define a simple UDF to transform the technology column\n", + "def transform_technology(tech):\n", + " return tech.upper()\n", + "\n", + "# Register the UDF\n", + "transform_udf = udf(transform_technology, StringType())\n", + "\n", + "# Apply the UDF to the DataFrame\n", + "transformed_df = df.withColumn(\"transformed_technology\", transform_udf(df[\"technology\"]))\n", + "\n", + "# Show the transformed DataFrame\n", + "transformed_df.show()" + ] + }, + { + "cell_type": "markdown", + "id": "47ab1ba6-deaf-488d-9e95-8202669d948c", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Extract content from PDFs linked in arXiv papers and generate prompts for extracting specific information.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8b52c87e-5971-4d28-bc4b-4160d29a1c24", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:27:08.3224773Z", + "execution_start_time": "2024-10-31T04:27:08.0430507Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "4eeab690-4159-41dc-be69-3cceed484314", + "queued_time": "2024-10-31T04:27:07.6309068Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 30, + "statement_ids": [ + 30 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 30, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.document_loaders import OnlinePDFLoader\n", + "\n", + "def paper_content_extraction(inputs: dict) -> dict:\n", + " arxiv_link = inputs[\"arxiv_link\"]\n", + " loader = OnlinePDFLoader(arxiv_link)\n", + " pages = loader.load_and_split()\n", + " return {\"paper_content\": pages[0].page_content + pages[1].page_content}\n", + "\n", + "def prompt_generation(inputs: dict) -> dict:\n", + " output = inputs[\"Output\"]\n", + " prompt = (\n", + " \"find the paper title, author, summary in the paper description below, output them. \"\n", + " \"After that, Use websearch to find out 3 recent papers of the first author in the author section below \"\n", + " \"(first author is the first name separated by comma) and list the paper titles in bullet points: \"\n", + " \"\\n\" + output + \".\"\n", + " )\n", + " return {\"prompt\": prompt}" + ] + }, + { + "cell_type": "markdown", + "id": "89d79c38-ba0c-4062-911c-7ede02536298", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Set up a chain to extract structured information from an arXiv link\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "e85241a0-11c2-49c1-9b2e-63187cb24d9a", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:28:11.2331925Z", + "execution_start_time": "2024-10-31T04:28:11.0134852Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "232b4aa0-1b84-47f8-bb5d-347a575d9640", + "queued_time": "2024-10-31T04:28:10.663514Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 31, + "statement_ids": [ + 31 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 31, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from langchain.chains import TransformChain, SimpleSequentialChain\n", + "\n", + "paper_content_extraction_chain = TransformChain(\n", + " input_variables=[\"arxiv_link\"],\n", + " output_variables=[\"paper_content\"],\n", + " transform=paper_content_extraction,\n", + " verbose=False,\n", + ")\n", + "\n", + "paper_summarizer_template = \"\"\"\n", + "You are a paper summarizer, given the paper content, it is your job to summarize the paper into a short summary, \n", + "and extract authors and paper title from the paper content.\n", + "\"\"\"" + ] + }, + { + "cell_type": "markdown", + "id": "64937339-791c-4aad-953b-ca990bfd324a", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Use Microsoft Fabric's native integration with the MLflow framework to log the trained machine learning models, the used hyperparameters, and evaluation metrics." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5bac7684-a123-4733-baa3-a748ff0fd070", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [ + { + "data": { + "application/vnd.livy.statement-meta+json": { + "execution_finish_time": "2024-10-31T04:36:54.8917645Z", + "execution_start_time": "2024-10-31T04:36:44.7561664Z", + "livy_statement_state": "available", + "normalized_state": "finished", + "parent_msg_id": "d2abef17-25d7-41c4-a62f-051d9b5fe8d7", + "queued_time": "2024-10-31T04:36:44.2999954Z", + "session_id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "session_start_time": null, + "spark_pool": null, + "state": "finished", + "statement_id": 33, + "statement_ids": [ + 33 + ] + }, + "text/plain": [ + "StatementMeta(, 7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325, 33, Finished, Available, Finished)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'RandomForestRegressionModel' already exists. Creating a new version of this model...\n", + "2024/10/31 04:36:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestRegressionModel, version 2\n", + "Created version '2' of model 'RandomForestRegressionModel'.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model Name: RandomForestRegressionModel\n", + "Model Version: 2\n" + ] + }, + { + "data": { + "application/vnd.mlflow.run-widget+json": { + "data": { + "metrics": {}, + "params": { + "n_estimators": "3", + "random_state": "42" + }, + "tags": { + "mlflow.rootRunId": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", + "mlflow.runName": "icy_hamster_xr34qfzf", + "mlflow.user": "4b3a56ea-6f42-450e-b7c3-fb2932c7ac32", + "synapseml.experiment.artifactId": "17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b", + "synapseml.experimentName": "Notebook-1", + "synapseml.livy.id": "7383b5d4-1dea-4b9b-85d6-fe5ef5b7d325", + "synapseml.notebook.artifactId": "789d5fef-b2a1-409b-996f-0cdb4e748a90", + "synapseml.user.id": "ea5a1fdc-a08c-493a-bce9-8422f28ecd05", + "synapseml.user.name": "System Administrator" + } + }, + "info": { + "artifact_uri": "sds://onelakewestus3.pbidedicated.windows.net/6361aeaa-b63a-44ea-b28f-26db10b31a6c/17b41ab7-b0e0-4adc-9fc9-403dd72b6e5b/20c75f63-d266-40b1-83f7-d9c76fd1f4f4/artifacts", + "end_time": 1730349412, + "experiment_id": "d52403ad-a9c2-41ba-b582-9b8e9a57917e", + "lifecycle_stage": "active", + "run_id": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", + "run_name": "", + "run_uuid": "20c75f63-d266-40b1-83f7-d9c76fd1f4f4", + "start_time": 1730349405, + "status": "FINISHED", + "user_id": "7ebfac85-3ebb-440f-a743-e52052051f6a" + }, + "inputs": { + "dataset_inputs": [] + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import mlflow\n", + "from mlflow.models import infer_signature\n", + "from sklearn.datasets import make_regression\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "\n", + "# Generate synthetic regression data\n", + "X, y = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)\n", + "\n", + "# Model parameters\n", + "params = {\"n_estimators\": 3, \"random_state\": 42}\n", + "\n", + "# Model tags for MLflow\n", + "model_tags = {\n", + " \"project_name\": \"grocery-forecasting\",\n", + " \"store_dept\": \"produce\",\n", + " \"team\": \"stores-ml\",\n", + " \"project_quarter\": \"Q3-2023\"\n", + "}\n", + "\n", + "# Log MLflow entities\n", + "with mlflow.start_run() as run:\n", + " # Train the model\n", + " model = RandomForestRegressor(**params).fit(X, y)\n", + "\n", + " # Infer the model signature\n", + " signature = infer_signature(X, model.predict(X))\n", + "\n", + " # Log parameters and the model\n", + " mlflow.log_params(params)\n", + " mlflow.sklearn.log_model(model, artifact_path=\"sklearn-model\", signature=signature)\n", + "\n", + " # Register the model with tags\n", + " model_uri = f\"runs:/{run.info.run_id}/sklearn-model\"\n", + " model_version = mlflow.register_model(model_uri, \"RandomForestRegressionModel\", tags=model_tags)\n", + "\n", + " # Output model registration details\n", + " print(f\"Model Name: {model_version.name}\")\n", + " print(f\"Model Version: {model_version.version}\")" + ] + }, + { + "cell_type": "markdown", + "id": "315ebdcd-e78c-4bc5-93d6-f202d02bddc5", + "metadata": { + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "source": [ + "## Use MLflow to search among multiple models saved within the workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60e6f7d3-d1ec-4ccc-9745-6c7938d2f4bc", + "metadata": { + "jupyter": { + "outputs_hidden": false, + "source_hidden": false + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark" + }, + "nteract": { + "transient": { + "deleting": false + } + } + }, + "outputs": [], + "source": [ + "from pprint import pprint\n", + "from mlflow.tracking import MlflowClient\n", + "\n", + "client = MlflowClient()\n", + "for rm in client.search_registered_models():\n", + " pprint(dict(rm), indent=4)" + ] + } + ], + "metadata": { + "application/vnd.jupyter.widget-state+json": { + "version": "1.0" + }, + "dependencies": { + "environment": { + "environmentId": "766562be-9e21-456c-b270-cac7e4bf8d18", + "workspaceId": "6361aeaa-b63a-44ea-b28f-26db10b31a6c" + } + }, + "kernel_info": { + "name": "synapse_pyspark" + }, + "kernelspec": { + "display_name": "Synapse PySpark", + "language": "Python", + "name": "synapse_pyspark" + }, + "language_info": { + "name": "python" + }, + "microsoft": { + "language": "python", + "language_group": "synapse_pyspark", + "ms_spell_check": { + "ms_spell_check_language": "en" + } + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + }, + "spark_compute": { + "compute_id": "/trident/default", + "session_options": { + "conf": { + "spark.synapse.nbs.session.timeout": "1200000" + } + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file