From 711212d71b4cd37273b5bcb4720631cc9ee409eb Mon Sep 17 00:00:00 2001 From: brown9804 Date: Sun, 23 Nov 2025 21:30:37 -0600 Subject: [PATCH 01/14] deploy infra + connections + prepare demo data --- .gitignore | 3 + TROUBLESHOOTING.md | 331 +++++++++++++++++ src/DATA_PIPELINE.md | 265 ++++++++++++++ src/data/updated_product_catalog(in).csv | 21 ++ src/pipelines/create_search_index.py | 119 ++++++ src/pipelines/import_to_search.py | 149 ++++++++ src/pipelines/ingest_to_cosmos.py | 135 +++++++ src/pipelines/upload_to_search.py | 122 +++++++ src/requirements.txt | 8 + src/verify_data.py | 24 ++ src/verify_search.py | 27 ++ terraform-infrastructure/main.tf | 424 ++++++++++++++++++---- terraform-infrastructure/terraform.tfvars | 2 +- terraform-infrastructure/variables.tf | 7 + 14 files changed, 1566 insertions(+), 71 deletions(-) create mode 100644 TROUBLESHOOTING.md create mode 100644 src/DATA_PIPELINE.md create mode 100644 src/data/updated_product_catalog(in).csv create mode 100644 src/pipelines/create_search_index.py create mode 100644 src/pipelines/import_to_search.py create mode 100644 src/pipelines/ingest_to_cosmos.py create mode 100644 src/pipelines/upload_to_search.py create mode 100644 src/requirements.txt create mode 100644 src/verify_data.py create mode 100644 src/verify_search.py diff --git a/.gitignore b/.gitignore index 6349e36..c95816f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ # Local .terraform directories .terraform/ +*.terraform.lock.hcl +.terraform.lock.hcl +*src/.env # .tfstate files *.tfstate diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..d4305a8 --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,331 @@ +# Troubleshooting Guide - Overview + +Costa Rica + +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-11-12 + +---------- + +> This guide covers common issues you may encounter 
when deploying and running this Azure AI Shopping demo application. + +## Table of Contents +- [Python Environment Issues](#python-environment-issues) +- [Azure Authentication Issues](#azure-authentication-issues) +- [Cosmos DB Issues](#cosmos-db-issues) +- [Data Pipeline Issues](#data-pipeline-issues) +- [Terraform Issues](#terraform-issues) + +--- + +## Python Environment Issues + +### Python Not Found +``` +ERROR: Python is not installed or not in PATH +``` + +**Solution**: +- Install Python 3.8+ from https://www.python.org/downloads/ +- Ensure Python is added to your system PATH during installation +- Verify installation: `python --version` + +### Virtual Environment Creation Failed +``` +ERROR: Failed to create virtual environment +``` + +**Solution**: +- Ensure you have write permissions to the `src` directory +- Try deleting existing `venv` folder: `Remove-Item -Recurse -Force venv` +- Check if `python -m venv` works manually: `python -m venv test_venv` +- On Windows, ensure your execution policy allows script execution + +### Package Installation Failed +``` +ERROR: Could not install packages due to an OSError +``` + +**Solution**: +- Update pip: `python -m pip install --upgrade pip` +- Clear pip cache: `pip cache purge` +- Try installing with `--no-cache-dir`: `pip install --no-cache-dir -r requirements.txt` +- For Windows + pandas issues, use pre-built wheels by ensuring `pandas>=2.2.2` in requirements.txt + +--- + +## Azure Authentication Issues + +### Not Logged into Azure CLI +``` +ERROR: Please run 'az login' to setup account +``` + +**Solution**: +```powershell +# Login to Azure CLI +az login + +# Verify you're logged in with the correct account +az account show + +# If needed, set the correct subscription +az account set --subscription +``` + +### AAD Authentication Failed +``` +DefaultAzureCredential failed to retrieve a token +``` + +**Solution**: +1. Ensure you're logged into Azure CLI: `az login` +2. Check your account has proper permissions +3. 
Verify the resource exists and you have access +4. Try clearing Azure credentials cache: `az account clear` then `az login` again + +--- + +## Cosmos DB Issues + +### Local Authorization Disabled Error +``` +ERROR: Local Authorization is disabled. Use an AAD token to authorize all requests. +``` + +This error occurs when Cosmos DB requires Azure Active Directory (AAD) authentication instead of key-based authentication. + +**Common Causes and Solutions**: + +#### 1. Not logged into Azure CLI + +```powershell +# Login to Azure CLI +az login + +# Verify you're logged in with the correct account +az account show + +# If needed, set the correct subscription +az account set --subscription <subscription-id> +``` + +After logging in, try running the script again. + +#### 2. Public Network Access Disabled + +If your Cosmos DB has public network access disabled, your local machine or Codespace VM cannot connect. + +**Solution via Azure Portal**: +- Navigate to your Cosmos DB account in the Azure portal +- Select **Networking** from the Settings menu +- Ensure **Public network access** is set to **All networks** +- Click **Save** +- Wait a few minutes for the change to propagate +- Try running the script again + +**Solution via Azure CLI**: +```powershell +az cosmosdb update \ + --name <cosmos-account-name> \ + --resource-group <resource-group-name> \ + --enable-public-network true +``` + +#### 3. Insufficient Permissions + +Your Azure account needs appropriate role assignments on the Cosmos DB account. 
+ +**Required roles**: +- `Cosmos DB Built-in Data Contributor` (for read/write access) +- Or `Contributor` at the resource group level + +**Solution via Azure CLI**: +```powershell +# Get your user object ID +$userId = (az ad signed-in-user show --query id -o tsv) + +# Assign Cosmos DB Data Contributor role +az cosmosdb sql role assignment create \ + --account-name <cosmos-account-name> \ + --resource-group <resource-group-name> \ + --role-definition-id 00000000-0000-0000-0000-000000000002 \ + --principal-id $userId \ + --scope "/" +``` + +### Connection Timeout +``` +ERROR: Request timeout +``` + +**Solution**: +- Check your network connection +- Verify Cosmos DB firewall settings allow your IP address +- Ensure public network access is enabled (see above) +- Check if Azure services are experiencing outages: https://status.azure.com/ + +--- + +## Data Pipeline Issues + +### CSV File Not Found +``` +WARNING: CSV data file not found at data/updated_product_catalog(in).csv +``` + +**Solution**: +Download or place the product catalog CSV file in the `src/data/` directory: + +```bash +curl -o "src/data/updated_product_catalog(in).csv" "https://raw.githubusercontent.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/main/src/data/updated_product_catalog(in).csv" +``` + +### CSV Parsing Error +``` +ERROR: Error tokenizing data. 
C error: Expected X fields, saw Y +``` + +**Solution**: +- Ensure CSV fields with commas are properly quoted +- Check for special characters or encoding issues +- Verify the CSV has the correct number of columns (6): ProductID, ProductName, ProductCategory, ProductDescription, Price, ImageUrl +- Try opening the CSV in a text editor to check for formatting issues + +### Environment File Missing +``` +ERROR: .env file not found +``` + +**Solution**: +```bash +# Run Terraform to generate the .env file +cd terraform-infrastructure +terraform apply -auto-approve +``` + +### Failed to Authenticate to Cosmos DB +``` +ERROR: Failed to authenticate to Cosmos DB using DefaultAzureCredential and no valid COSMOS_DB_KEY was provided +``` + +**Solution**: +- Ensure your `.env` file is properly generated with correct keys +- Run `terraform apply` again if needed +- Check that `COSMOS_DB_ENDPOINT` and `COSMOS_DB_KEY` are set correctly in `.env` +- The script will automatically try AAD authentication first, then fall back to key-based auth + +--- + +## Terraform Issues + +### Resource Already Exists +``` +ERROR: A resource with the ID already exists +``` + +**Solution**: +- Import the existing resource: `terraform import . 
` +- Or destroy and recreate: `terraform destroy` then `terraform apply` +- Check for resources in other resource groups with the same name + +### Insufficient Permissions +``` +ERROR: The client does not have authorization to perform action +``` + +**Solution**: +- Ensure your Azure account has `Contributor` or `Owner` role on the subscription or resource group +- Check if specific Azure policies are blocking resource creation +- Contact your Azure administrator to grant necessary permissions + +### Provider Configuration Error +``` +ERROR: Error configuring the backend "azurerm" +``` + +**Solution**: +- Verify your Azure credentials are configured: `az login` +- Check that the specified subscription exists and you have access +- Ensure the backend storage account and container exist (if using remote state) + +### State Lock Error +``` +ERROR: Error acquiring the state lock +``` + +**Solution**: +```bash +# Force unlock (use with caution) +terraform force-unlock <LOCK_ID> +``` + +Only force-unlock if you're certain no other Terraform process is running. + +--- + +## General Tips + +### Enable Verbose Logging + +For more detailed error information: + +**Azure CLI**: +```powershell +az <command> --debug +``` + +**Python Scripts**: +Set environment variable before running: +```powershell +$env:AZURE_LOG_LEVEL = "DEBUG" +python pipelines/script.py +``` + +**Terraform**: +```bash +export TF_LOG=DEBUG +terraform apply +``` + +### Check Azure Service Health + +If experiencing unexpected issues, check Azure service status: +- https://status.azure.com/ + +### Clean Up and Retry + +> Sometimes a clean slate helps: + +```powershell +# Clean Python environment +Remove-Item -Recurse -Force venv +python -m venv venv + +# Clean Terraform state (use with caution) +terraform destroy +Remove-Item -Recurse -Force .terraform +terraform init +terraform apply +``` + +--- + +## Still Having Issues? + +> If you continue experiencing problems: + +1. 
Check the [GitHub repository issues](https://github.com/MicrosoftCloudEssentials-LearningHub/Agentic-DevOps-AI-Shopping/issues) +2. Review Azure documentation for specific services +3. Enable detailed logging as described above +4. Collect error messages, logs, and configuration details +5. Create a new issue with detailed information about your problem + + +
+ Total views +

Refresh Date: 2025-11-12

+
+ \ No newline at end of file diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md new file mode 100644 index 0000000..8b50807 --- /dev/null +++ b/src/DATA_PIPELINE.md @@ -0,0 +1,265 @@ +# Data Pipeline Automation - Overview + +Costa Rica + +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-11-12 + +---------- + +> This automation handles the complete data pipeline setup for the Azure AI Shopping application. + +> [!NOTE] +> What It Does: +> The data pipeline automation performs the following tasks: +> 1. **Creates Python Virtual Environment**: Sets up an isolated Python environment with all required dependencies +> 2. **Imports Data to Cosmos DB**: Loads product catalog data from CSV into Cosmos DB container +> 3. **Creates Azure AI Search Index**: Sets up a search index with vector search capabilities +> 4. **Imports Data to Search**: Populates the search index from Cosmos DB using an indexer + + +
+ Prerequisites: (Click to expand) + +- Python 3.8 or higher installed and available in PATH +- Product catalog CSV file at `src/data/updated_product_catalog(in).csv` (demo) + +
+ +> Automated by Terraform: + +- Cosmos DB account and database +- Azure AI Search service +- Azure OpenAI model deployments +- Environment variables in `src/.env` + +## Usage + +> Option 1: Run Automatically with Terraform → Enable data pipeline automation in `terraform.tfvars`: + +```hcl +enable_data_pipeline = true +``` + +Then run: + +```bash +terraform apply -auto-approve +``` + +This will: +- Deploy all Azure resources +- Create AI model deployments +- Generate `.env` file +- **Automatically run the complete data pipeline** + +> Option 2: Run Manually → If you prefer to run the data pipeline manually or separately: + +1. **Ensure `.env` file exists** (created by Terraform): + ```bash + cd terraform-infrastructure + terraform apply -auto-approve + ``` + +2. **Navigate to src directory**: + ```bash + cd ../src + ``` + +3. **Create virtual environment and install dependencies**: + ```powershell + python -m venv venv + .\venv\Scripts\Activate.ps1 + pip install --upgrade pip + pip install -r requirements.txt + ``` + +4. 
**Run pipeline scripts in order**: + ```powershell + # Step 1: Import data to Cosmos DB + python pipelines/ingest_to_cosmos.py + + # Step 2: Create Azure AI Search index + python pipelines/create_search_index.py + + # Step 3: Upload data to search index + python pipelines/upload_to_search.py + ``` + +## Data Files + +> Product Catalog CSV → The product catalog data should be placed at: + +``` +src/data/updated_product_catalog(in).csv +``` + +> Expected columns: +- `ProductID`: Unique product identifier +- `ProductName`: Product name +- `ProductCategory`: Product category +- `ProductDescription`: Product description +- `Price`: Product price +- `ImageUrl`: URL to product image + +> Download Data → If you don't have the data file, you can download it from the reference repository [TechWorkshop-L300-AI-Apps-and-agents](https://github.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/tree/main), please feel free to follow the guide as well [Guide - TechWorkshop L300: AI Apps and Agents](https://microsoft.github.io/TechWorkshop-L300-AI-Apps-and-agents/): + +```bash +# Download the product catalog data +curl -o "src/data/updated_product_catalog(in).csv" "https://raw.githubusercontent.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/main/src/data/updated_product_catalog(in).csv" +``` + +## Scripts + + +
+ pipelines/ingest_to_cosmos.py (Click to expand) + +- Reads CSV data with product catalog +- Connects to Cosmos DB (uses AAD or key-based auth) +- Creates database and container if they don't exist +- Imports all products with upsert operations +- Creates `content_for_vector` field for semantic search +- **Smart Skip Logic**: + - By default (`COSMOS_SKIP_IF_EXISTS=true`), checks if container already has data + - If data exists, skips import to avoid duplicates and save time + - Set `COSMOS_FORCE_INGEST=true` to force re-import even if data exists + - Set `COSMOS_SKIP_IF_EXISTS=false` to always import (legacy behavior) + +
+ + +
+ pipelines/create_search_index.py (Click to expand) + +- Creates Azure AI Search index with vector search +- Configures HNSW algorithm for vector search +- Sets up Azure OpenAI vectorizer +- Defines searchable and filterable fields + +
+ + +
+
+ pipelines/create_search_index.py (Click to expand) + +- Creates Azure AI Search index with vector search capabilities +- Configures HNSW algorithm for efficient vector similarity search +- Sets up Azure OpenAI vectorizer with text-embedding-3-small model +- Defines searchable, filterable, and vector fields +- Supports hybrid search (keyword + semantic) + +
+ + +
+
+ pipelines/create_search_index.py (Click to expand) + +- Creates Azure AI Search index with vector search +- Configures HNSW algorithm for vector search +- Sets up Azure OpenAI vectorizer +- Defines searchable and filterable fields + +
+ + +
+ pipelines/upload_to_search.py (Click to expand) + +- Reads all documents from Cosmos DB container +- Authenticates using AAD or key-based auth (auto-fallback) +- Maps Cosmos DB fields to Azure AI Search index schema +- Uploads documents in batches to Azure AI Search +- Provides detailed success/failure reporting +- **Note**: This script replaces the traditional indexer approach to avoid managed identity complexity when Cosmos DB local auth is disabled + +
+ +## Troubleshooting + +> For detailed troubleshooting guidance, see [TROUBLESHOOTING.md](../TROUBLESHOOTING.md). Quick Reference: + +- **Python Not Found**: Install Python 3.8+ from https://www.python.org/downloads/ +- **CSV File Not Found**: Download the product catalog CSV file and place it in `src/data/` directory +- **Authentication Errors**: Run `az login` and ensure you have proper permissions. See [TROUBLESHOOTING.md](../TROUBLESHOOTING.md#azure-authentication-issues) for detailed solutions. +- **Virtual Environment Issues**: Delete `venv` folder and recreate. See [TROUBLESHOOTING.md](../TROUBLESHOOTING.md#python-environment-issues) for details. + +## Configuration + +> All configuration is pulled from the `.env` file created by Terraform: + +```bash +COSMOS_DB_ENDPOINT=... +COSMOS_DB_KEY=... +COSMOS_DB_NAME=... +COSMOS_DB_CONTAINER_NAME=products +COSMOS_SKIP_IF_EXISTS=true # Skip import if data already exists +COSMOS_FORCE_INGEST=false # Force re-import even if data exists +SEARCH_SERVICE_ENDPOINT=... +SEARCH_SERVICE_KEY=... +SEARCH_INDEX_NAME=products-index +AZURE_OPENAI_ENDPOINT=... +AZURE_OPENAI_API_KEY=... 
+AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small +``` + +### Environment Variable Reference + +| Variable | Default | Description | +|----------|---------|-------------| +| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | +| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | +| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | +| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | +| `COSMOS_DB_NAME` | - | Database name | +| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | + +## Verification + +> After running the pipeline, verify data was imported: + +### Check Cosmos DB +```powershell +az cosmosdb sql container show \ + --account-name \ + --database-name zava \ + --name products \ + --resource-group +``` + +### Check Search Index +```powershell +az search index show \ + --index-name products-index \ + --service-name \ + --resource-group +``` + +### Query Search Index +```powershell +az search index show-statistics \ + --index-name products-index \ + --service-name \ + --resource-group +``` + +## Next Steps + +> After the data pipeline completes: + +1. Your Cosmos DB container is populated with product data +2. Azure AI Search index is created with vector search enabled +3. Search index is populated from Cosmos DB +4. You can now build AI agents that query this data +5. Use the search index for hybrid search (keyword + semantic) + + +
+ Total views +

Refresh Date: 2025-11-12

+
+ \ No newline at end of file diff --git a/src/data/updated_product_catalog(in).csv b/src/data/updated_product_catalog(in).csv new file mode 100644 index 0000000..426e823 --- /dev/null +++ b/src/data/updated_product_catalog(in).csv @@ -0,0 +1,21 @@ +ProductID,ProductName,ProductCategory,ProductDescription,Price,ImageUrl +1001,Zava Smart Speaker,Electronics,Voice-controlled speaker with high-fidelity audio and edge AI noise suppression,89.99,https://example.com/images/speaker.jpg +1002,Zava Wireless Earbuds,Electronics,Comfort-fit earbuds with adaptive EQ and 30h battery life,129.00,https://example.com/images/earbuds.jpg +1003,Zava Fitness Tracker,Sports,Water-resistant tracker with heart rate variability and sleep stage insights,59.50,https://example.com/images/tracker.jpg +1004,Zava Running Shoes,Sports,Breathable mesh performance shoes with responsive foam sole,104.95,https://example.com/images/runningshoes.jpg +1005,Zava Cotton Hoodie,Apparel,Ultra-soft recycled cotton hoodie with antimicrobial treatment,54.99,https://example.com/images/hoodie.jpg +1006,Zava Insulated Bottle,Home,Double-wall stainless steel bottle keeps drinks cold 24h / hot 12h,28.00,https://example.com/images/bottle.jpg +1007,Zava Ceramic Mug,Home,Matte glaze 14oz mug safe for dishwasher and microwave,12.75,https://example.com/images/mug.jpg +1008,Zava Multi-Tool Outdoor,Sports,Compact 11-in-1 stainless multi-tool with locking blades,36.40,https://example.com/images/multitool.jpg +1009,Zava Hair Serum,Beauty,Nutrient-rich lightweight serum for frizz control and shine,24.50,https://example.com/images/hairserum.jpg +1010,Zava Vitamin C Gummies,Grocery,Non-GMO vegan gummies with natural citrus flavor (90 count),18.99,https://example.com/images/vitaminc.jpg +1011,Zava Gaming Mouse,Electronics,Customizable RGB ergonomic mouse with 12K DPI sensor,64.00,https://example.com/images/mouse.jpg +1012,Zava Mechanical Keyboard,Electronics,"Hot-swap switches, per-key lighting, and PBT 
keycaps",139.95,https://example.com/images/keyboard.jpg +1013,Zava Desk Lamp,Home,Adjustable LED lamp with ambient backlight and USB-C charging port,42.25,https://example.com/images/desklamp.jpg +1014,Zava Noise Masking Device,Electronics,Generates adaptive ambient sound for focus and sleep environments,79.99,https://example.com/images/noisemask.jpg +1015,Zava Travel Backpack,Apparel,Weather-resistant 28L backpack with laptop sleeve and hidden pocket,98.00,https://example.com/images/backpack.jpg +1016,Zava Smart Plug,Electronics,Energy monitoring smart plug with over-current protection,19.95,https://example.com/images/smartplug.jpg +1017,Zava LED Strip Kit,Electronics,16M color Wi-Fi LED strip with music sync mode,34.50,https://example.com/images/ledstrip.jpg +1018,Zava Foam Roller,Sports,"High-density recovery roller improves circulation and muscle release",25.00,https://example.com/images/foamroller.jpg +1019,Zava Sunscreen SPF50,Beauty,Broad-spectrum mineral sunscreen water-resistant for 80 minutes,21.25,https://example.com/images/sunscreen.jpg +1020,Zava Organic Trail Mix,Grocery,"Blend of roasted nuts, seeds, and dried berries (16oz)",11.49,https://example.com/images/trailmix.jpg \ No newline at end of file diff --git a/src/pipelines/create_search_index.py b/src/pipelines/create_search_index.py new file mode 100644 index 0000000..d9e9333 --- /dev/null +++ b/src/pipelines/create_search_index.py @@ -0,0 +1,119 @@ +import logging +import os +from azure.search.documents.indexes import SearchIndexClient +from azure.search.documents.indexes.models import ( + SearchIndex, + SearchField, + SearchFieldDataType, + SimpleField, + SearchableField, + VectorSearch, + HnswAlgorithmConfiguration, + VectorSearchProfile, + AzureOpenAIVectorizer, + AzureOpenAIVectorizerParameters +) +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + +# Configuration +SEARCH_ENDPOINT = 
os.environ.get("SEARCH_SERVICE_ENDPOINT") +SEARCH_KEY = os.environ.get("SEARCH_SERVICE_KEY") +INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "products-index") +AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT") +AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY") +EMBEDDING_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small") + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def create_search_index(): + """Create Azure AI Search index with vector search capabilities.""" + + if not SEARCH_ENDPOINT: + raise ValueError("SEARCH_SERVICE_ENDPOINT must be provided in environment variables") + + # Create client + try: + logger.info("Attempting to create Search Index Client...") + if SEARCH_KEY: + credential = AzureKeyCredential(SEARCH_KEY) + else: + credential = DefaultAzureCredential() + + index_client = SearchIndexClient(endpoint=SEARCH_ENDPOINT, credential=credential) + logger.info("Search Index Client created successfully") + except Exception as e: + logger.error(f"Failed to create Search Index Client: {e}") + raise + + # Define the index fields + fields = [ + SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True), + SimpleField(name="ProductID", type=SearchFieldDataType.String, filterable=True), + SearchableField(name="ProductName", type=SearchFieldDataType.String, searchable=True), + SearchableField(name="ProductCategory", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=True), + SearchableField(name="ProductDescription", type=SearchFieldDataType.String, searchable=True), + SimpleField(name="ProductPrice", type=SearchFieldDataType.Double, filterable=True, sortable=True), + SimpleField(name="ProductImageURL", type=SearchFieldDataType.String), + SearchableField(name="content_for_vector", type=SearchFieldDataType.String, searchable=True), + SearchField( + name="content_vector", + 
type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + searchable=True, + vector_search_dimensions=1536, # text-embedding-3-small dimensions + vector_search_profile_name="vector-profile" + ) + ] + + # Configure vector search + vector_search = VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration(name="hnsw-algorithm") + ], + profiles=[ + VectorSearchProfile( + name="vector-profile", + algorithm_configuration_name="hnsw-algorithm", + vectorizer_name="openai-vectorizer" + ) + ], + vectorizers=[ + AzureOpenAIVectorizer( + vectorizer_name="openai-vectorizer", + parameters=AzureOpenAIVectorizerParameters( + resource_url=AZURE_OPENAI_ENDPOINT, + deployment_name=EMBEDDING_DEPLOYMENT, + model_name="text-embedding-3-small", # Required in API version 2025-09-01 + api_key=AZURE_OPENAI_API_KEY + ) + ) + ] + ) + + # Create the search index + index = SearchIndex( + name=INDEX_NAME, + fields=fields, + vector_search=vector_search + ) + + try: + logger.info(f"Creating search index: {INDEX_NAME}...") + result = index_client.create_or_update_index(index) + logger.info(f"Search index '{result.name}' created successfully") + return result + except Exception as e: + logger.error(f"Failed to create search index: {e}") + raise + +def main(): + create_search_index() + logger.info("Search index creation completed successfully") + +if __name__ == "__main__": + main() diff --git a/src/pipelines/import_to_search.py b/src/pipelines/import_to_search.py new file mode 100644 index 0000000..873b6ac --- /dev/null +++ b/src/pipelines/import_to_search.py @@ -0,0 +1,149 @@ +import logging +import os +from azure.cosmos import CosmosClient +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexerClient +from azure.search.documents.indexes.models import ( + SearchIndexerDataSourceConnection, + SearchIndexerDataContainer, + SearchIndexer, + FieldMapping, + IndexingSchedule +) +from azure.core.credentials import AzureKeyCredential +from 
azure.identity import DefaultAzureCredential +from dotenv import load_dotenv +import time + +load_dotenv() + +# Configuration +COSMOS_ENDPOINT = os.environ.get("COSMOS_DB_ENDPOINT") +COSMOS_KEY = os.environ.get("COSMOS_DB_KEY") +DATABASE_NAME = os.environ.get("COSMOS_DB_NAME") +CONTAINER_NAME = os.environ.get("COSMOS_DB_CONTAINER_NAME") +SEARCH_ENDPOINT = os.environ.get("SEARCH_SERVICE_ENDPOINT") +SEARCH_KEY = os.environ.get("SEARCH_SERVICE_KEY") +INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "products-index") +DATASOURCE_NAME = f"{INDEX_NAME}-datasource" +INDEXER_NAME = f"{INDEX_NAME}-indexer" + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def create_cosmos_datasource(): + """Create a data source connection to Cosmos DB.""" + + if not SEARCH_KEY: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(SEARCH_KEY) + + indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential) + + # Create the data source connection + container = SearchIndexerDataContainer(name=CONTAINER_NAME) + + data_source_connection = SearchIndexerDataSourceConnection( + name=DATASOURCE_NAME, + type="cosmosdb", + connection_string=f"AccountEndpoint={COSMOS_ENDPOINT};AccountKey={COSMOS_KEY};Database={DATABASE_NAME}", + container=container + ) + + try: + logger.info(f"Creating data source: {DATASOURCE_NAME}...") + result = indexer_client.create_or_update_data_source_connection(data_source_connection) + logger.info(f"Data source '{result.name}' created successfully") + return result + except Exception as e: + logger.error(f"Failed to create data source: {e}") + raise + +def create_indexer(): + """Create an indexer to import data from Cosmos DB to Azure AI Search.""" + + if not SEARCH_KEY: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(SEARCH_KEY) + + indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential) + + # Create the 
indexer + indexer = SearchIndexer( + name=INDEXER_NAME, + data_source_name=DATASOURCE_NAME, + target_index_name=INDEX_NAME, + field_mappings=[ + FieldMapping(source_field_name="id", target_field_name="id"), + FieldMapping(source_field_name="ProductID", target_field_name="ProductID"), + FieldMapping(source_field_name="ProductName", target_field_name="ProductName"), + FieldMapping(source_field_name="ProductCategory", target_field_name="ProductCategory"), + FieldMapping(source_field_name="ProductDescription", target_field_name="ProductDescription"), + FieldMapping(source_field_name="ProductPrice", target_field_name="ProductPrice"), + FieldMapping(source_field_name="ProductImageURL", target_field_name="ProductImageURL"), + FieldMapping(source_field_name="content_for_vector", target_field_name="content_for_vector"), + ] + ) + + try: + logger.info(f"Creating indexer: {INDEXER_NAME}...") + result = indexer_client.create_or_update_indexer(indexer) + logger.info(f"Indexer '{result.name}' created successfully") + return result + except Exception as e: + logger.error(f"Failed to create indexer: {e}") + raise + +def run_indexer(): + """Run the indexer to start data import.""" + + if not SEARCH_KEY: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(SEARCH_KEY) + + indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential) + + try: + logger.info(f"Running indexer: {INDEXER_NAME}...") + indexer_client.run_indexer(INDEXER_NAME) + logger.info("Indexer started successfully") + + # Wait for indexer to complete + logger.info("Waiting for indexer to complete...") + for i in range(30): # Wait up to 5 minutes + time.sleep(10) + status = indexer_client.get_indexer_status(INDEXER_NAME) + last_result = status.last_result + + if last_result: + logger.info(f"Indexer status: {last_result.status}") + if last_result.status == "success": + logger.info(f"Indexer completed successfully. 
Indexed {last_result.items_processed} items.") + return + elif last_result.status == "transientFailure" or last_result.status == "persistentFailure": + logger.error(f"Indexer failed: {last_result.error_message}") + raise Exception(f"Indexer failed: {last_result.error_message}") + + logger.warning("Indexer is still running after timeout") + except Exception as e: + logger.error(f"Failed to run indexer: {e}") + raise + +def main(): + # Step 1: Create Cosmos DB data source + create_cosmos_datasource() + + # Step 2: Create indexer + create_indexer() + + # Step 3: Run indexer + run_indexer() + + logger.info("Data import to Azure AI Search completed successfully") + +if __name__ == "__main__": + main() diff --git a/src/pipelines/ingest_to_cosmos.py b/src/pipelines/ingest_to_cosmos.py new file mode 100644 index 0000000..9c84fe3 --- /dev/null +++ b/src/pipelines/ingest_to_cosmos.py @@ -0,0 +1,135 @@ +import logging +import pandas as pd +import os +from azure.cosmos import CosmosClient, PartitionKey +from azure.identity import DefaultAzureCredential +from azure.core.exceptions import AzureError +from dotenv import load_dotenv + +load_dotenv() + +# CONFIGURATIONS - Replace with your actual values +COSMOS_ENDPOINT = os.environ.get("COSMOS_DB_ENDPOINT") +COSMOS_KEY = os.environ.get("COSMOS_DB_KEY") +DATABASE_NAME = os.environ.get("COSMOS_DB_NAME") +CONTAINER_NAME = os.environ.get("COSMOS_DB_CONTAINER_NAME") +SKIP_IF_EXISTS = os.environ.get("COSMOS_SKIP_IF_EXISTS", "true").lower() == "true" +FORCE_INGEST = os.environ.get("COSMOS_FORCE_INGEST", "false").lower() == "true" +CSV_FILE = r"data/updated_product_catalog(in).csv" + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def get_cosmos_client(endpoint: str | None, key: str | None = None): + """Try to authenticate to Cosmos DB using DefaultAzureCredential first. + + If that fails, fall back to using the provided key. + Returns a connected CosmosClient instance. 
+ """ + if not endpoint: + raise ValueError("COSMOS_DB_ENDPOINT must be provided in environment variables") + + # Try AAD first + try: + logger.info("Attempting to authenticate to Cosmos DB using DefaultAzureCredential (AAD)...") + credential = DefaultAzureCredential() + client = CosmosClient(endpoint, credential=credential) + + # Perform a light operation to validate the credential + _ = list(client.list_databases()) + logger.info("Authenticated to Cosmos DB with DefaultAzureCredential.") + return client + except AzureError as ex: + logger.warning("AAD authentication failed: %s", ex) + + # Fallback to key + if key: + try: + logger.info("Falling back to endpoint + key authentication for Cosmos DB...") + client = CosmosClient(endpoint, key) + # Validate key by a light operation + _ = list(client.list_databases()) + logger.info("Authenticated to Cosmos DB with endpoint+key.") + return client + except Exception as ex: + logger.error("Endpoint+key authentication failed: %s", ex) + raise + + # If we reach here, both auth methods failed or no key provided + raise RuntimeError("Failed to authenticate to Cosmos DB using DefaultAzureCredential and no valid COSMOS_DB_KEY was provided") + +def main(): + # 1. Read data from CSV + logger.info(f"Reading data from {CSV_FILE}...") + df = pd.read_csv(CSV_FILE, encoding='utf-8', quoting=1) # quoting=1 is csv.QUOTE_ALL + + # Create content for vector search + df['content_for_vector'] = ( + df['ProductName'].fillna('').astype(str) + ' | ' + + df['ProductCategory'].fillna('').astype(str) + ' | ' + + df['ProductDescription'].fillna('').astype(str) + ) + + logger.info(f"Loaded {len(df)} products from CSV") + + # 2. 
Connect to Cosmos DB + client = get_cosmos_client(COSMOS_ENDPOINT, COSMOS_KEY) + + if not DATABASE_NAME: + raise ValueError("COSMOS_DB_NAME must be provided in environment variables") + + if not CONTAINER_NAME: + raise ValueError("COSMOS_DB_CONTAINER_NAME must be provided in environment variables") + + database = client.create_database_if_not_exists(id=DATABASE_NAME) + logger.info(f"Connected to database: {DATABASE_NAME}") + + container = database.create_container_if_not_exists( + id=CONTAINER_NAME, + partition_key=PartitionKey(path="/ProductID") + ) + logger.info(f"Connected to container: {CONTAINER_NAME}") + + # Check existing item count (lightweight) + existing_count = 0 + try: + count_query = list(container.query_items( + query="SELECT VALUE COUNT(1) FROM c", + enable_cross_partition_query=True + )) + if count_query: + raw_val = count_query[0] + if isinstance(raw_val, dict): + for k in ("$1", "count", "COUNT"): + if k in raw_val: + raw_val = raw_val[k] + break + if isinstance(raw_val, (int, float, str)): + existing_count = int(raw_val) + except Exception as ex: + logger.warning(f"Count query failed (will ignore): {ex}") + + if existing_count > 0 and SKIP_IF_EXISTS and not FORCE_INGEST: + logger.info( + f"Container already has {existing_count} items. Skipping ingestion (SKIP_IF_EXISTS=true, FORCE_INGEST=false)." + ) + return + + # 3. 
Upload items + logger.info("Starting data upload to Cosmos DB...") + for idx, row in enumerate(df.itertuples(index=False), start=1): + # Convert row to dict + item = row._asdict() + item['id'] = str(item['ProductID']) + item['ProductID'] = str(item['ProductID']) + + # Insert or update item + container.upsert_item(body=item) + if idx % 10 == 0: + logger.info(f"Uploaded {idx}/{len(df)} products") + + logger.info(f"Successfully uploaded all {len(df)} products to Cosmos DB.") + +if __name__ == "__main__": + main() diff --git a/src/pipelines/upload_to_search.py b/src/pipelines/upload_to_search.py new file mode 100644 index 0000000..f689233 --- /dev/null +++ b/src/pipelines/upload_to_search.py @@ -0,0 +1,122 @@ +import logging +import os +from azure.cosmos import CosmosClient +from azure.search.documents import SearchClient +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + +# Configuration +COSMOS_ENDPOINT = os.environ.get("COSMOS_DB_ENDPOINT") +COSMOS_KEY = os.environ.get("COSMOS_DB_KEY") +DATABASE_NAME = os.environ.get("COSMOS_DB_NAME") +CONTAINER_NAME = os.environ.get("COSMOS_DB_CONTAINER_NAME") +SEARCH_ENDPOINT = os.environ.get("SEARCH_SERVICE_ENDPOINT") +SEARCH_KEY = os.environ.get("SEARCH_SERVICE_KEY") +INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "products-index") + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def get_cosmos_client(endpoint: str, key: str | None = None): + """Get Cosmos DB client with AAD or key-based auth.""" + if not endpoint: + raise ValueError("COSMOS_DB_ENDPOINT must be provided") + + # Try AAD first + try: + logger.info("Attempting to authenticate to Cosmos DB using DefaultAzureCredential (AAD)...") + credential = DefaultAzureCredential() + client = CosmosClient(endpoint, credential=credential) + # Validate + _ = list(client.list_databases()) + logger.info("Authenticated to 
Cosmos DB with DefaultAzureCredential.") + return client + except Exception as ex: + logger.warning(f"AAD authentication failed: {ex}") + + # Fallback to key + if key: + try: + logger.info("Falling back to key-based authentication for Cosmos DB...") + client = CosmosClient(endpoint, key) + # Validate + _ = list(client.list_databases()) + logger.info("Authenticated to Cosmos DB with key.") + return client + except Exception as ex: + logger.error(f"Key authentication failed: {ex}") + + raise RuntimeError("Failed to authenticate to Cosmos DB") + +def upload_documents_to_search(): + """Read documents from Cosmos DB and upload directly to Azure AI Search.""" + + # Connect to Cosmos DB + cosmos_client = get_cosmos_client(COSMOS_ENDPOINT, COSMOS_KEY) + database = cosmos_client.get_database_client(DATABASE_NAME) + container = database.get_container_client(CONTAINER_NAME) + + # Get all documents from Cosmos DB + logger.info(f"Reading documents from Cosmos DB container: {CONTAINER_NAME}...") + query = "SELECT * FROM c" + items = list(container.query_items(query=query, enable_cross_partition_query=True)) + logger.info(f"Retrieved {len(items)} documents from Cosmos DB") + + if len(items) == 0: + logger.warning("No documents found in Cosmos DB container") + return + + # Connect to Search + if SEARCH_KEY: + search_credential = AzureKeyCredential(SEARCH_KEY) + else: + search_credential = DefaultAzureCredential() + + search_client = SearchClient(endpoint=SEARCH_ENDPOINT, index_name=INDEX_NAME, credential=search_credential) + + # Prepare documents for upload + documents = [] + for item in items: + # Map Cosmos DB fields to Search index fields + doc = { + "id": str(item.get("id", item.get("ProductID"))), # Use Cosmos id or ProductID + "ProductID": str(item.get("ProductID")), + "ProductName": item.get("ProductName"), + "ProductCategory": item.get("ProductCategory"), + "ProductDescription": item.get("ProductDescription"), + "ProductPrice": float(item.get("Price", 
item.get("ProductPrice", 0.0))), + "ProductImageURL": item.get("ImageUrl", item.get("ProductImageURL", "")), + "content_for_vector": item.get("content_for_vector", "") + } + documents.append(doc) + + # Upload documents in batches + logger.info(f"Uploading {len(documents)} documents to Azure AI Search index: {INDEX_NAME}...") + try: + result = search_client.upload_documents(documents=documents) + success_count = sum(1 for r in result if r.succeeded) + failed_count = len(result) - success_count + + logger.info(f"Upload completed: {success_count} succeeded, {failed_count} failed") + + if failed_count > 0: + for r in result: + if not r.succeeded: + logger.error(f"Failed to upload document {r.key}: {r.error_message}") + + return success_count + except Exception as e: + logger.error(f"Failed to upload documents to search: {e}") + raise + +def main(): + logger.info("Starting data upload from Cosmos DB to Azure AI Search...") + count = upload_documents_to_search() + logger.info(f"Data upload completed successfully. 
{count} documents uploaded.") + +if __name__ == "__main__": + main() diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..8f79d8d --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,8 @@ +requests==2.32.3 +python-dotenv==1.0.1 +pandas>=2.2.2 +azure-cosmos==4.9.0 +azure-identity==1.19.0 +azure-search-documents==11.6.0 +openai==1.54.5 +azure-ai-inference==1.0.0b6 diff --git a/src/verify_data.py b/src/verify_data.py new file mode 100644 index 0000000..e30444e --- /dev/null +++ b/src/verify_data.py @@ -0,0 +1,24 @@ +from azure.cosmos import CosmosClient +from azure.identity import DefaultAzureCredential +import os +from dotenv import load_dotenv +import json + +load_dotenv() + +credential = DefaultAzureCredential() +client = CosmosClient(os.environ['COSMOS_DB_ENDPOINT'], credential) +db = client.get_database_client(os.environ['COSMOS_DB_NAME']) +container = db.get_container_client(os.environ['COSMOS_DB_CONTAINER_NAME']) + +# Count total items +count = list(container.query_items('SELECT VALUE COUNT(1) FROM c', enable_cross_partition_query=True))[0] +print(f'✓ Total items in Cosmos DB container: {count}') + +# Get sample products +items = list(container.query_items('SELECT TOP 3 c.ProductID, c.ProductName, c.ProductCategory, c.Price FROM c ORDER BY c.ProductID', enable_cross_partition_query=True)) +print(f'\n✓ Sample products:') +for item in items: + print(f" - {item['ProductID']}: {item['ProductName']} ({item['ProductCategory']}) - ${item['Price']}") + +print('\n✓ Data successfully loaded into Cosmos DB!') diff --git a/src/verify_search.py b/src/verify_search.py new file mode 100644 index 0000000..e11e63e --- /dev/null +++ b/src/verify_search.py @@ -0,0 +1,27 @@ +from azure.search.documents import SearchClient +from azure.core.credentials import AzureKeyCredential +from dotenv import load_dotenv +import os + +load_dotenv() + +credential = AzureKeyCredential(os.environ['SEARCH_SERVICE_KEY']) +client = SearchClient( + 
endpoint=os.environ['SEARCH_SERVICE_ENDPOINT'], + index_name=os.environ['SEARCH_INDEX_NAME'], + credential=credential +) + +# Count documents +results = client.search(search_text='*', include_total_count=True) +total_count = results.get_count() +print(f'✓ Total documents in Azure AI Search index: {total_count}') + +# Show sample products +print(f'\n✓ Sample products:') +for i, doc in enumerate(results): + print(f" - {doc['ProductID']}: {doc['ProductName']} ({doc['ProductCategory']}) - ${doc['ProductPrice']}") + if i >= 2: + break + +print('\n✓ Data successfully loaded into Azure AI Search!') diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf index ece0b39..652b2f0 100644 --- a/terraform-infrastructure/main.tf +++ b/terraform-infrastructure/main.tf @@ -15,19 +15,20 @@ resource "random_id" "suffix" { locals { # Use provided user_principal_id or default to current Azure CLI user - principal_id = var.user_principal_id != null ? var.user_principal_id : data.azurerm_client_config.current.object_id - suffix = substr(random_id.suffix.hex, 0, 8) - cosmos_account_name = "${var.name_prefix}${local.suffix}cosmosdb" - cosmos_db_name = "zava" - storage_account = lower(replace("${var.name_prefix}${local.suffix}sa", "-", "")) - ai_foundry_name = "aif-${local.suffix}" # custom subdomain - ai_project_name = "proj-${local.suffix}" - search_service_name = "${var.name_prefix}-${local.suffix}-search" - app_service_plan = "${var.name_prefix}-${local.suffix}-asp" - log_analytics_name = "${var.name_prefix}-${local.suffix}-la" - app_insights_name = "${var.name_prefix}-${local.suffix}-ai" - registry_name = lower(replace("${var.name_prefix}${local.suffix}cosureg", "-", "")) - web_app_name = "${var.name_prefix}-${local.suffix}-app" + principal_id = var.user_principal_id != null ? 
var.user_principal_id : data.azurerm_client_config.current.object_id + suffix = substr(random_id.suffix.hex, 0, 8) + cosmos_account_name = "${var.name_prefix}${local.suffix}cosmosdb" + cosmos_db_name = "zava" + storage_account = lower(replace("${var.name_prefix}${local.suffix}sa", "-", "")) + ai_foundry_name = "aif-${local.suffix}" # custom subdomain + ai_project_name = "proj-${local.suffix}" + search_service_name = "${var.name_prefix}-${local.suffix}-search" + app_service_plan = "${var.name_prefix}-${local.suffix}-asp" + log_analytics_name = "${var.name_prefix}-${local.suffix}-la" + app_insights_name = "${var.name_prefix}-${local.suffix}-ai" + registry_name = lower(replace("${var.name_prefix}${local.suffix}cosureg", "-", "")) + web_app_name = "${var.name_prefix}-${local.suffix}-app" + cosmos_connection_auth_type = var.enable_cosmos_local_auth ? "AccountKey" : "AAD" } resource "azurerm_cosmosdb_account" "cosmos" { @@ -45,9 +46,9 @@ resource "azurerm_cosmosdb_account" "cosmos" { location = var.location failover_priority = 0 } - free_tier_enabled = false - analytical_storage_enabled = false - local_authentication_disabled = !var.enable_cosmos_local_auth + free_tier_enabled = false + analytical_storage_enabled = false + local_authentication_disabled = !var.enable_cosmos_local_auth } resource "azurerm_cosmosdb_sql_database" "cosmosdb" { @@ -57,28 +58,37 @@ resource "azurerm_cosmosdb_sql_database" "cosmosdb" { throughput = 400 } +resource "azurerm_cosmosdb_sql_container" "products" { + name = "product_catalog" + resource_group_name = azurerm_resource_group.rg.name + account_name = azurerm_cosmosdb_account.cosmos.name + database_name = azurerm_cosmosdb_sql_database.cosmosdb.name + partition_key_paths = ["/ProductID"] + throughput = 400 +} + # Storage account using AzAPI to bypass policy restrictions resource "azapi_resource" "storage" { type = "Microsoft.Storage/storageAccounts@2023-01-01" name = local.storage_account location = var.location parent_id = 
azurerm_resource_group.rg.id - + body = jsonencode({ sku = { name = "Standard_LRS" } kind = "StorageV2" properties = { - accessTier = "Hot" - allowSharedKeyAccess = true + accessTier = "Hot" + allowSharedKeyAccess = true defaultToOAuthAuthentication = false - allowBlobPublicAccess = false - minimumTlsVersion = "TLS1_2" - supportsHttpsTrafficOnly = true + allowBlobPublicAccess = false + minimumTlsVersion = "TLS1_2" + supportsHttpsTrafficOnly = true } }) - + identity { type = "SystemAssigned" } @@ -86,10 +96,10 @@ resource "azapi_resource" "storage" { # AI Foundry account (preview) using AzAPI provider. resource "azapi_resource" "ai_foundry" { - type = "Microsoft.CognitiveServices/accounts@2025-06-01" - name = local.ai_foundry_name - location = var.location - parent_id = azurerm_resource_group.rg.id + type = "Microsoft.CognitiveServices/accounts@2025-06-01" + name = local.ai_foundry_name + location = var.location + parent_id = azurerm_resource_group.rg.id schema_validation_enabled = false identity { type = "SystemAssigned" } body = jsonencode({ @@ -104,13 +114,13 @@ resource "azapi_resource" "ai_foundry" { } resource "azapi_resource" "ai_project" { - type = "Microsoft.CognitiveServices/accounts/projects@2025-06-01" - name = local.ai_project_name - location = var.location - parent_id = azapi_resource.ai_foundry.id + type = "Microsoft.CognitiveServices/accounts/projects@2025-06-01" + name = local.ai_project_name + location = var.location + parent_id = azapi_resource.ai_foundry.id schema_validation_enabled = false identity { type = "SystemAssigned" } - body = jsonencode({ properties = {} }) + body = jsonencode({ properties = {} }) depends_on = [azapi_resource.ai_foundry] } @@ -157,7 +167,7 @@ resource "azurerm_container_registry_webhook" "webhook" { status = "enabled" scope = "${local.suffix}/techworkshopl300/zava:latest" actions = ["push"] - + custom_headers = { "Content-Type" = "application/json" } @@ -185,7 +195,7 @@ resource "azurerm_linux_web_app" "app" { 
docker_image_name = "${local.registry_name}.azurecr.io/${local.suffix}/techworkshopl300/zava:latest" docker_registry_url = "https://${local.registry_name}.azurecr.io" } - http2_enabled = true + http2_enabled = true minimum_tls_version = "1.2" } @@ -297,7 +307,7 @@ resource "azurerm_role_assignment" "storage_blob_data_contributor_project" { # Azure AI model deployments automation resource "null_resource" "ai_model_deployments" { count = var.enable_ai_automation ? 1 : 0 - + depends_on = [ azapi_resource.ai_project, azapi_resource.ai_foundry, @@ -305,7 +315,7 @@ resource "null_resource" "ai_model_deployments" { ] provisioner "local-exec" { - command = <<-EOT + command = <<-EOT # Create AI model deployments Write-Host "Creating Azure AI model deployments..." @@ -395,58 +405,221 @@ resource "null_resource" "ai_model_deployments" { } } -# Connect resources to Azure AI Foundry project -resource "null_resource" "ai_project_connections" { +# Connection helper actions for Foundry resources +data "azapi_resource_action" "storage_list_keys" { + count = var.enable_ai_automation ? 1 : 0 + type = "Microsoft.Storage/storageAccounts@2023-01-01" + resource_id = azapi_resource.storage.id + action = "listKeys" + response_export_values = ["keys"] + body = jsonencode({}) + depends_on = [azapi_resource.storage] +} + +data "azapi_resource_action" "search_admin_keys" { + count = var.enable_ai_automation ? 1 : 0 + type = "Microsoft.Search/searchServices@2025-02-01-preview" + resource_id = azurerm_search_service.search.id + action = "listAdminKeys" + response_export_values = ["primaryKey"] + body = jsonencode({}) + depends_on = [azurerm_search_service.search] +} + +data "azapi_resource_action" "cosmos_keys" { + count = (var.enable_ai_automation && var.enable_cosmos_local_auth) ? 
1 : 0 + type = "Microsoft.DocumentDB/databaseAccounts@2024-11-15" + resource_id = azurerm_cosmosdb_account.cosmos.id + action = "listKeys" + response_export_values = ["primaryMasterKey"] + body = jsonencode({}) + depends_on = [azurerm_cosmosdb_account.cosmos] +} + +# Connect resources to Azure AI Foundry project using ARM templates +resource "azapi_resource" "storage_connection" { count = var.enable_ai_automation ? 1 : 0 - + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-storage" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + + depends_on = [ + azapi_resource.storage, + azapi_resource.ai_foundry + ] + + body = jsonencode({ + properties = { + category = "AzureStorageAccount" + target = "https://${local.storage_account}.blob.core.windows.net" + authType = "AccountKey" + isSharedToAll = true + credentials = { + key = jsondecode(data.azapi_resource_action.storage_list_keys[0].output).keys[0].value + } + metadata = { + ApiType = "Azure" + ResourceId = azapi_resource.storage.id + } + } + }) +} + +resource "azapi_resource" "app_insights_connection" { + count = var.enable_ai_automation ? 1 : 0 + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-appinsights" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + depends_on = [ - null_resource.ai_model_deployments, azurerm_application_insights.appinsights, - azapi_resource.storage + azapi_resource.ai_foundry ] - provisioner "local-exec" { - command = <<-EOT - Write-Host "Verifying Azure AI Foundry project configuration..." - - # Check if Azure ML extension is installed - $mlExtension = az extension list --query "[?name=='ml'].name" --output tsv - if (-not $mlExtension) { - Write-Host "Installing Azure ML extension..." 
- az extension add --name ml + body = jsonencode({ + properties = { + category = "AppInsights" + target = azurerm_application_insights.appinsights.id + authType = "ApiKey" + isSharedToAll = true + credentials = { + key = azurerm_application_insights.appinsights.connection_string } - - # Set the AI project as the default workspace for future ML operations - az config set defaults.workspace="${local.ai_project_name}" - az config set defaults.group="${azurerm_resource_group.rg.name}" - - Write-Host "Azure AI project configuration completed successfully." - Write-Host "Project Name: ${local.ai_project_name}" + metadata = { + ApiType = "Azure" + ResourceId = azurerm_application_insights.appinsights.id + } + } + }) +} + +resource "azapi_resource" "search_connection" { + count = var.enable_ai_automation ? 1 : 0 + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-aisearch" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + + depends_on = [ + azurerm_search_service.search, + azapi_resource.ai_foundry + ] + + body = jsonencode({ + properties = { + category = "CognitiveSearch" + target = "https://${local.search_service_name}.search.windows.net" + authType = "ApiKey" + isSharedToAll = true + credentials = { + key = jsondecode(data.azapi_resource_action.search_admin_keys[0].output).primaryKey + } + metadata = { + ApiType = "Azure" + ResourceId = azurerm_search_service.search.id + location = azurerm_search_service.search.location + } + } + }) +} + +resource "azapi_resource" "cosmos_connection" { + count = var.enable_ai_automation ? 
1 : 0 + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-cosmosdb" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + + depends_on = [ + azurerm_cosmosdb_account.cosmos, + azapi_resource.ai_foundry + ] + + body = jsonencode({ + properties = merge({ + category = "CosmosDb" + target = azurerm_cosmosdb_account.cosmos.endpoint + authType = local.cosmos_connection_auth_type + isSharedToAll = true + metadata = { + ApiType = "Azure" + ResourceId = azurerm_cosmosdb_account.cosmos.id + location = azurerm_cosmosdb_account.cosmos.location + } + }, var.enable_cosmos_local_auth ? { + credentials = { + key = jsondecode(data.azapi_resource_action.cosmos_keys[0].output).primaryMasterKey + } + } : {}) + }) +} + +# Verification script for connections +resource "null_resource" "verify_connections" { + count = var.enable_ai_automation ? 1 : 0 + + depends_on = [ + azapi_resource.storage_connection, + azapi_resource.app_insights_connection, + azapi_resource.search_connection, + azapi_resource.cosmos_connection + ] + + provisioner "local-exec" { + command = <<-EOT + Write-Host "=== Verifying Microsoft Foundry Project Connections ===" + Write-Host "" + Write-Host "Project: ${local.ai_project_name}" Write-Host "AI Foundry: ${local.ai_foundry_name}" Write-Host "Resource Group: ${azurerm_resource_group.rg.name}" + Write-Host "" + + # List connections using Azure CLI + Write-Host "Checking connections via Azure CLI..." + az rest --method GET --url "https://management.azure.com/subscriptions/${data.azurerm_client_config.current.subscription_id}/resourceGroups/${azurerm_resource_group.rg.name}/providers/Microsoft.CognitiveServices/accounts/${local.ai_foundry_name}/connections?api-version=2025-06-01" --query "value[].{Name:name,Type:properties.connectionType,Target:properties.target}" --output table + + Write-Host "" + Write-Host "✓ Microsoft Foundry project connections verification completed!" 
+ Write-Host "" + Write-Host "Available connections:" + Write-Host " - Storage Account: ${local.storage_account}" + Write-Host " - Application Insights: ${local.app_insights_name}" + Write-Host " - Azure AI Search: ${local.search_service_name}" + Write-Host " - Cosmos DB: ${local.cosmos_account_name}" + Write-Host "" + Write-Host "View in Azure Portal:" + Write-Host " https://ai.azure.com/resource/overview/${local.ai_foundry_name}" + Write-Host " Navigate to Management center > Connected resources" EOT interpreter = ["PowerShell", "-Command"] } triggers = { - storage_id = azapi_resource.storage.id - app_insights_id = azurerm_application_insights.appinsights.id - ai_project_id = azapi_resource.ai_project.id + storage_conn = var.enable_ai_automation ? azapi_resource.storage_connection[0].id : "" + app_insights_conn = var.enable_ai_automation ? azapi_resource.app_insights_connection[0].id : "" + search_conn = var.enable_ai_automation ? azapi_resource.search_connection[0].id : "" + cosmos_conn = var.enable_ai_automation ? azapi_resource.cosmos_connection[0].id : "" } } # Create .env file with all necessary configuration resource "null_resource" "create_env_file" { count = var.enable_ai_automation ? 1 : 0 - + depends_on = [ - null_resource.ai_project_connections, + null_resource.verify_connections, azurerm_cosmosdb_account.cosmos, azurerm_search_service.search ] provisioner "local-exec" { - command = <<-EOT + command = <<-EOT Write-Host "Creating .env file with Azure resource configuration..." 
# Create src directory if it doesn't exist @@ -509,7 +682,9 @@ AZURE_OPENAI_API_VERSION=2024-02-01 COSMOS_DB_ENDPOINT=${azurerm_cosmosdb_account.cosmos.endpoint} COSMOS_DB_KEY=$cosmosKey COSMOS_DB_NAME=${local.cosmos_db_name} -COSMOS_DB_CONTAINER_NAME=products +COSMOS_DB_CONTAINER_NAME=product_catalog +COSMOS_SKIP_IF_EXISTS=true +COSMOS_FORCE_INGEST=false # Azure AI Search Configuration SEARCH_SERVICE_ENDPOINT=https://${local.search_service_name}.search.windows.net @@ -546,7 +721,9 @@ AZURE_OPENAI_API_VERSION=2024-02-01 COSMOS_DB_ENDPOINT=${azurerm_cosmosdb_account.cosmos.endpoint} COSMOS_DB_KEY=$cosmosKey COSMOS_DB_NAME=${local.cosmos_db_name} -COSMOS_DB_CONTAINER_NAME=products +COSMOS_DB_CONTAINER_NAME=product_catalog +COSMOS_SKIP_IF_EXISTS=true +COSMOS_FORCE_INGEST=false # Azure AI Search Configuration SEARCH_SERVICE_ENDPOINT=https://${local.search_service_name}.search.windows.net @@ -589,11 +766,118 @@ AZURE_LOCATION=${var.location} triggers = { # Trigger recreation when any of these resources change - ai_foundry_id = azapi_resource.ai_foundry.id - ai_project_id = azapi_resource.ai_project.id - cosmos_id = azurerm_cosmosdb_account.cosmos.id - search_id = azurerm_search_service.search.id - storage_id = azapi_resource.storage.id + ai_foundry_id = azapi_resource.ai_foundry.id + ai_project_id = azapi_resource.ai_project.id + cosmos_id = azurerm_cosmosdb_account.cosmos.id + search_id = azurerm_search_service.search.id + storage_id = azapi_resource.storage.id app_insights_id = azurerm_application_insights.appinsights.id } } + +# Data pipeline automation - runs after .env file is created +resource "null_resource" "data_pipeline" { + count = var.enable_data_pipeline ? 1 : 0 + + depends_on = [ + null_resource.create_env_file, + azurerm_cosmosdb_sql_database.cosmosdb, + azurerm_cosmosdb_sql_container.products + ] + + provisioner "local-exec" { + command = <<-EOT + Write-Host "Starting data pipeline automation..." 
+ + # Navigate to src directory + cd ../src + + # Check if Python is available + try { + $pythonCmd = (Get-Command python -ErrorAction Stop).Source + Write-Host "Found Python at: $pythonCmd" + } catch { + Write-Host "ERROR: Python is not installed or not in PATH" + Write-Host "Please install Python 3.8+ from https://www.python.org/downloads/" + exit 1 + } + + # Create virtual environment + Write-Host "Creating Python virtual environment..." + if (Test-Path "venv") { + Write-Host "Virtual environment already exists, removing..." + Remove-Item -Recurse -Force venv + } + python -m venv venv + + # Install dependencies directly to venv without activation + Write-Host "Installing Python dependencies (with retry)..." + $pythonExe = "venv\Scripts\python.exe" + $pipExe = "venv\Scripts\pip.exe" + + if (Test-Path $pythonExe) { + & $pythonExe -m pip install --upgrade pip + $maxAttempts = 3 + for ($i = 1; $i -le $maxAttempts; $i++) { + Write-Host "pip install attempt $i..." + & $pipExe install -r requirements.txt + if ($LASTEXITCODE -eq 0) { + Write-Host "Dependencies installed successfully on attempt $i" + break + } else { + Write-Host "pip install failed (exit $LASTEXITCODE)." + if ($i -lt $maxAttempts) { + Write-Host "Retrying after short backoff..." + Start-Sleep -Seconds 5 + } else { + Write-Host "ERROR: Dependencies failed after $maxAttempts attempts" + exit 1 + } + } + } + + Write-Host "Python environment ready" + Write-Host "" + + # Check if CSV data file exists + $csvFile = "data/updated_product_catalog(in).csv" + if (!(Test-Path $csvFile)) { + Write-Host "WARNING: CSV data file not found at $csvFile" + Write-Host "Please download the product catalog data or place it in the data directory" + Write-Host "Skipping data import for now" + } else { + Write-Host "Step 1: Importing data to Cosmos DB (skip logic flags: COSMOS_SKIP_IF_EXISTS / COSMOS_FORCE_INGEST)..." 
+ & $pythonExe pipelines/ingest_to_cosmos.py + + Write-Host "" + Write-Host "Step 2: Creating Azure AI Search index..." + & $pythonExe pipelines/create_search_index.py + + Write-Host "" + Write-Host "Step 3: Uploading data from Cosmos DB to Azure AI Search..." + & $pythonExe pipelines/upload_to_search.py + + Write-Host "" + Write-Host "Data pipeline completed successfully!" + Write-Host "- Cosmos DB container created and populated" + Write-Host "- Azure AI Search index created" + Write-Host "- Data imported to search index" + } + } else { + Write-Host "ERROR: Failed to create virtual environment" + exit 1 + } + + Write-Host "" + Write-Host "Data pipeline automation completed" + EOT + interpreter = ["PowerShell", "-Command"] + working_dir = path.module + } + + triggers = { + cosmos_db_id = azurerm_cosmosdb_sql_database.cosmosdb.id + search_id = azurerm_search_service.search.id + env_file_id = null_resource.create_env_file[0].id + } +} diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars index 0953c4e..2cc89f4 100644 --- a/terraform-infrastructure/terraform.tfvars +++ b/terraform-infrastructure/terraform.tfvars @@ -1,4 +1,4 @@ -resource_group_name = "RG-AI-retailw3" +resource_group_name = "RG-AI-retailbrw5" location = "westus3" name_prefix = "zava" # user_principal_id is optional - defaults to current Azure CLI user (az login) diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf index bda5d3d..f69a72e 100644 --- a/terraform-infrastructure/variables.tf +++ b/terraform-infrastructure/variables.tf @@ -32,3 +32,10 @@ variable "enable_ai_automation" { description = "Whether to run Azure AI Foundry automation steps (model deployments, connections, .env creation)" default = true } + +variable "enable_data_pipeline" { + type = bool + description = "Whether to run data pipeline automation (requires Python and data files)" + default = true +} + From 80fd6e505e439ba87fd2c291563086bdf7215b53 Mon 
Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 24 Nov 2025 03:32:01 +0000 Subject: [PATCH 02/14] Update last modified date in Markdown files --- TROUBLESHOOTING.md | 2 +- src/DATA_PIPELINE.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index d4305a8..7bc84a1 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-11-12 +Last updated: 2025-11-24 ---------- diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index 8b50807..b1c3721 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-11-12 +Last updated: 2025-11-24 ---------- From cdd6d731ae9d97256faab21ea58d984d40cddfed Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:41:43 -0600 Subject: [PATCH 03/14] Revise and expand troubleshooting guide Updated troubleshooting guide with detailed sections on common issues related to Python environment, Azure authentication, Cosmos DB, data pipeline, Terraform, and general tips. Enhanced formatting and added new troubleshooting steps. --- TROUBLESHOOTING.md | 114 ++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 59 deletions(-) diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 7bc84a1..4c15965 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -11,18 +11,33 @@ Last updated: 2025-11-24 > This guide covers common issues you may encounter when deploying and running this Azure AI Shopping demo application. 
-## Table of Contents -- [Python Environment Issues](#python-environment-issues) -- [Azure Authentication Issues](#azure-authentication-issues) -- [Cosmos DB Issues](#cosmos-db-issues) -- [Data Pipeline Issues](#data-pipeline-issues) -- [Terraform Issues](#terraform-issues) +
+Table of Contents (Click to expand) + +- [Python Not Found](#python-not-found) +- [Virtual Environment Creation Failed](#virtual-environment-creation-failed) +- [Package Installation Failed](#package-installation-failed) +- [Not Logged into Azure CLI](#not-logged-into-azure-cli) +- [AAD Authentication Failed](#aad-authentication-failed) +- [Local Authorization Disabled Error](#local-authorization-disabled-error) +- [Connection Timeout](#connection-timeout) +- [CSV File Not Found](#csv-file-not-found) +- [CSV Parsing Error](#csv-parsing-error) +- [Environment File Missing](#environment-file-missing) +- [Failed to Authenticate to Cosmos DB](#failed-to-authenticate-to-cosmos-db) +- [Resource Already Exists](#resource-already-exists) +- [Insufficient Permissions](#insufficient-permissions) +- [Provider Configuration Error](#provider-configuration-error) +- [State Lock Error](#state-lock-error) +- [Enable Verbose Logging](#enable-verbose-logging) +- [Check Azure Service Health](#check-azure-service-health) +- [Clean Up and Retry](#clean-up-and-retry) +- [Still Having Issues?](#still-having-issues) + +
+ +## Python Not Found ---- - -## Python Environment Issues - -### Python Not Found ``` ERROR: Python is not installed or not in PATH ``` @@ -32,7 +47,7 @@ ERROR: Python is not installed or not in PATH - Ensure Python is added to your system PATH during installation - Verify installation: `python --version` -### Virtual Environment Creation Failed +## Virtual Environment Creation Failed ``` ERROR: Failed to create virtual environment ``` @@ -43,7 +58,7 @@ ERROR: Failed to create virtual environment - Check if `python -m venv` works manually: `python -m venv test_venv` - On Windows, ensure your execution policy allows script execution -### Package Installation Failed +## Package Installation Failed ``` ERROR: Could not install packages due to an OSError ``` @@ -54,11 +69,8 @@ ERROR: Could not install packages due to an OSError - Try installing with `--no-cache-dir`: `pip install --no-cache-dir -r requirements.txt` - For Windows + pandas issues, use pre-built wheels by ensuring `pandas>=2.2.2` in requirements.txt ---- -## Azure Authentication Issues - -### Not Logged into Azure CLI +## Not Logged into Azure CLI ``` ERROR: Please run 'az login' to setup account ``` @@ -75,7 +87,7 @@ az account show az account set --subscription ``` -### AAD Authentication Failed +## AAD Authentication Failed ``` DefaultAzureCredential failed to retrieve a token ``` @@ -86,20 +98,17 @@ DefaultAzureCredential failed to retrieve a token 3. Verify the resource exists and you have access 4. Try clearing Azure credentials cache: `az account clear` then `az login` again ---- - -## Cosmos DB Issues -### Local Authorization Disabled Error +## Local Authorization Disabled Error ``` ERROR: Local Authorization is disabled. Use an AAD token to authorize all requests. ``` -This error occurs when Cosmos DB requires Azure Active Directory (AAD) authentication instead of key-based authentication. 
+> This error occurs when Cosmos DB requires Azure Active Directory (AAD) authentication instead of key-based authentication. **Common Causes and Solutions**: -#### 1. Not logged into Azure CLI +- Not logged into Azure CLI ```powershell # Login to Azure CLI @@ -112,11 +121,11 @@ az account show az account set --subscription ``` -After logging in, try running the script again. +> After logging in, try running the script again. -#### 2. Public Network Access Disabled +- Public Network Access Disabled -If your Cosmos DB has public network access disabled, your local machine or Codespace VM cannot connect. +> If your Cosmos DB has public network access disabled, your local machine or Codespace VM cannot connect. **Solution via Azure Portal**: - Navigate to your Cosmos DB account in the Azure portal @@ -134,9 +143,7 @@ az cosmosdb update \ --enable-public-network true ``` -#### 3. Insufficient Permissions - -Your Azure account needs appropriate role assignments on the Cosmos DB account. +- Insufficient Permissions: Your Azure account needs appropriate role assignments on the Cosmos DB account. 
**Required roles**: - `Cosmos DB Built-in Data Contributor` (for read/write access) @@ -156,7 +163,7 @@ az cosmosdb sql role assignment create \ --scope "/" ``` -### Connection Timeout +## Connection Timeout ``` ERROR: Request timeout ``` @@ -167,11 +174,8 @@ ERROR: Request timeout - Ensure public network access is enabled (see above) - Check if Azure services are experiencing outages: https://status.azure.com/ ---- - -## Data Pipeline Issues -### CSV File Not Found +## CSV File Not Found ``` WARNING: CSV data file not found at data/updated_product_catalog(in).csv ``` @@ -183,7 +187,7 @@ Download or place the product catalog CSV file in the `src/data/` directory: curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/main/src/data/updated_product_catalog(in).csv ``` -### CSV Parsing Error +## CSV Parsing Error ``` ERROR: Error tokenizing data. C error: Expected X fields, saw Y ``` @@ -194,7 +198,7 @@ ERROR: Error tokenizing data. 
C error: Expected X fields, saw Y - Verify the CSV has the correct number of columns (6): ProductID, ProductName, ProductCategory, ProductDescription, Price, ImageUrl - Try opening the CSV in a text editor to check for formatting issues -### Environment File Missing +## Environment File Missing ``` ERROR: .env file not found ``` @@ -206,7 +210,7 @@ cd terraform-infrastructure terraform apply -auto-approve ``` -### Failed to Authenticate to Cosmos DB +## Failed to Authenticate to Cosmos DB ``` ERROR: Failed to authenticate to Cosmos DB using DefaultAzureCredential and no valid COSMOS_DB_KEY was provided ``` @@ -217,11 +221,7 @@ ERROR: Failed to authenticate to Cosmos DB using DefaultAzureCredential and no v - Check that `COSMOS_DB_ENDPOINT` and `COSMOS_DB_KEY` are set correctly in `.env` - The script will automatically try AAD authentication first, then fall back to key-based auth ---- - -## Terraform Issues - -### Resource Already Exists +## Resource Already Exists ``` ERROR: A resource with the ID already exists ``` @@ -231,7 +231,8 @@ ERROR: A resource with the ID already exists - Or destroy and recreate: `terraform destroy` then `terraform apply` - Check for resources in other resource groups with the same name -### Insufficient Permissions +## Insufficient Permissions + ``` ERROR: The client does not have authorization to perform action ``` @@ -241,7 +242,8 @@ ERROR: The client does not have authorization to perform action - Check if specific Azure policies are blocking resource creation - Contact your Azure administrator to grant necessary permissions -### Provider Configuration Error +## Provider Configuration Error + ``` ERROR: Error configuring the backend "azurerm" ``` @@ -251,7 +253,8 @@ ERROR: Error configuring the backend "azurerm" - Check that the specified subscription exists and you have access - Ensure the backend storage account and container exist (if using remote state) -### State Lock Error +## State Lock Error + ``` ERROR: Error acquiring the 
state lock ``` @@ -262,13 +265,9 @@ ERROR: Error acquiring the state lock terraform force-unlock ``` -Only force-unlock if you're certain no other Terraform process is running. +> Only force-unlock if you're certain no other Terraform process is running. ---- - -## General Tips - -### Enable Verbose Logging +## Enable Verbose Logging For more detailed error information: @@ -290,12 +289,11 @@ export TF_LOG=DEBUG terraform apply ``` -### Check Azure Service Health +## Check Azure Service Health -If experiencing unexpected issues, check Azure service status: -- https://status.azure.com/ +> If experiencing unexpected issues, check [Azure service status](https://status.azure.com/) -### Clean Up and Retry +## Clean Up and Retry > Sometimes a clean slate helps: @@ -311,8 +309,6 @@ terraform init terraform apply ``` ---- - ## Still Having Issues? > If you continue experiencing problems: @@ -328,4 +324,4 @@ terraform apply Total views

Refresh Date: 2025-11-12

- \ No newline at end of file + From 9ac1e98452610ea6ee9a87451cbc9d0f05065991 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 24 Nov 2025 03:42:06 +0000 Subject: [PATCH 04/14] Update visitor count --- TROUBLESHOOTING.md | 4 ++-- src/DATA_PIPELINE.md | 4 ++-- terraform-infrastructure/README.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 4c15965..4042589 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -321,7 +321,7 @@ terraform apply
- Total views -

Refresh Date: 2025-11-12

+ Total views +

Refresh Date: 2025-11-24

diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index b1c3721..18ddc81 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -259,7 +259,7 @@ az search index show-statistics \
- Total views -

Refresh Date: 2025-11-12

+ Total views +

Refresh Date: 2025-11-24

\ No newline at end of file diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md index cb38b65..39c766e 100644 --- a/terraform-infrastructure/README.md +++ b/terraform-infrastructure/README.md @@ -119,7 +119,7 @@ graph TD;
- Total views -

Refresh Date: 2025-11-22

+ Total views +

Refresh Date: 2025-11-24

From 0f18e42d2e2ad9c7a80f288333a03a0863eb7712 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:46:46 -0600 Subject: [PATCH 05/14] Update DATA_PIPELINE.md with recent changes Updated the last updated date and total views in the documentation. Adjusted environment variable table formatting for clarity. --- src/DATA_PIPELINE.md | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index 18ddc81..7bc4f11 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-11-24 +Last updated: 2025-11-12 ---------- @@ -113,7 +113,6 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c ## Scripts -
pipelines/ingest_to_cosmos.py (Click to expand) @@ -142,7 +141,6 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c
-
pipelines/create_search_index.py (Click to expand) @@ -154,10 +152,8 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c
-
-
- pipelines/create_search_index.py (Click to expand) + pipelines/create_search_index.py (Click to expand) - Creates Azure AI Search index with vector search - Configures HNSW algorithm for vector search @@ -166,9 +162,8 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c
-
- pipelines/upload_to_search.py (Click to expand) + pipelines/upload_to_search.py (Click to expand) - Reads all documents from Cosmos DB container - Authenticates using AAD or key-based auth (auto-fallback) @@ -209,14 +204,14 @@ AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small ### Environment Variable Reference -| Variable | Default | Description | -|----------|---------|-------------| -| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | -| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | -| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | -| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | -| `COSMOS_DB_NAME` | - | Database name | -| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | +| Variable | Default | Description | +|----------------------------|---------|--------------------------------------------------------| +| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | +| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | +| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | +| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | +| `COSMOS_DB_NAME` | - | Database name | +| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | ## Verification @@ -259,7 +254,7 @@ az search index show-statistics \
- Total views -

Refresh Date: 2025-11-24

+ Total views +

Refresh Date: 2025-11-12

- \ No newline at end of file + From ef20cbff77588583b243e0c83f9721bc4127cb59 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 24 Nov 2025 03:46:59 +0000 Subject: [PATCH 06/14] Update last modified date in Markdown files --- src/DATA_PIPELINE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index 7bc4f11..d304cd6 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -5,7 +5,7 @@ Costa Rica [![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) [brown9804](https://github.com/brown9804) -Last updated: 2025-11-12 +Last updated: 2025-11-24 ---------- From 4d85f7206e4b49c22a9faaf6155c6c14eef2b5b7 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:47:51 -0600 Subject: [PATCH 07/14] Fix formatting issues in DATA_PIPELINE.md --- src/DATA_PIPELINE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index d304cd6..ae6cb6a 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -23,8 +23,8 @@ Last updated: 2025-11-24
Prerequisites: (Click to expand) -- Python 3.8 or higher installed and available in PATH -- Product catalog CSV file at `src/data/updated_product_catalog(in).csv` (demo) +> - Python 3.8 or higher installed and available in PATH +> - Product catalog CSV file at `src/data/updated_product_catalog(in).csv` (demo)
From 1d7a9cfdbfc7c9188ff2a09ed2a0fe986ca70c30 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 24 Nov 2025 03:48:01 +0000 Subject: [PATCH 08/14] Update visitor count --- src/DATA_PIPELINE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index ae6cb6a..a955bd8 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -254,7 +254,7 @@ az search index show-statistics \
- Total views -

Refresh Date: 2025-11-12

+ Total views +

Refresh Date: 2025-11-24

From 9a0179854ae44f6dda45bd046a41f292b49feb92 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:50:32 -0600 Subject: [PATCH 09/14] Enhance DATA_PIPELINE.md with table of contents Updated the documentation for the data pipeline to include a table of contents and improved section headings. --- src/DATA_PIPELINE.md | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index a955bd8..9c4cc3e 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -11,6 +11,25 @@ Last updated: 2025-11-24 > This automation handles the complete data pipeline setup for the Azure AI Shopping application. + +
+Table of Content (Click to expand) + +- [Usage](#usage) +- [Data Files](#data-files) +- [Scripts](#scripts) +- [Troubleshooting](#troubleshooting) +- [Configuration](#configuration) +- [Environment Variable Reference](#environment-variable-reference) +- [Verification](#verification) +- [Check Cosmos DB](#check-cosmos-db) +- [Check Search Index](#check-search-index) +- [Query Search Index](#query-search-index) +- [Next Steps](#next-steps) + +
+ + > [!NOTE] > What It Does: > The data pipeline automation performs the following tasks: @@ -202,7 +221,7 @@ AZURE_OPENAI_API_KEY=... AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small ``` -### Environment Variable Reference +## Environment Variable Reference | Variable | Default | Description | |----------------------------|---------|--------------------------------------------------------| @@ -217,7 +236,7 @@ AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small > After running the pipeline, verify data was imported: -### Check Cosmos DB +## Check Cosmos DB ```powershell az cosmosdb sql container show \ --account-name \ @@ -226,7 +245,7 @@ az cosmosdb sql container show \ --resource-group ``` -### Check Search Index +## Check Search Index ```powershell az search index show \ --index-name products-index \ @@ -234,7 +253,7 @@ az search index show \ --resource-group ``` -### Query Search Index +## Query Search Index ```powershell az search index show-statistics \ --index-name products-index \ From b2eda3801d616120d81ee6da3fe88ce07221356e Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:52:47 -0600 Subject: [PATCH 10/14] Fix formatting in DATA_PIPELINE.md --- src/DATA_PIPELINE.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index 9c4cc3e..12f059f 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -223,14 +223,14 @@ AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small ## Environment Variable Reference -| Variable | Default | Description | +| Variable | Default | Description | |----------------------------|---------|--------------------------------------------------------| -| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | -| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | -| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account 
endpoint URL | -| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | -| `COSMOS_DB_NAME` | - | Database name | -| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | +| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | +| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | +| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | +| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | +| `COSMOS_DB_NAME` | - | Database name | +| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | ## Verification From 3f0d719d0d37c1617aea34369943b6aa43f12352 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:54:29 -0600 Subject: [PATCH 11/14] Update DATA_PIPELINE.md for clarity and formatting --- src/DATA_PIPELINE.md | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index 12f059f..0cf0576 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -31,8 +31,7 @@ Last updated: 2025-11-24 > [!NOTE] -> What It Does: -> The data pipeline automation performs the following tasks: +> What It Does? The data pipeline automation performs the following tasks: > 1. **Creates Python Virtual Environment**: Sets up an isolated Python environment with all required dependencies > 2. **Imports Data to Cosmos DB**: Loads product catalog data from CSV into Cosmos DB container > 3. 
**Creates Azure AI Search Index**: Sets up a search index with vector search capabilities @@ -223,14 +222,14 @@ AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small ## Environment Variable Reference -| Variable | Default | Description | -|----------------------------|---------|--------------------------------------------------------| -| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | -| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | -| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | -| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | -| `COSMOS_DB_NAME` | - | Database name | -| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | +| Variable | Default | Description | +|----------------------------|---------|-------------------------------------------------------| +| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | +| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | +| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | +| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | +| `COSMOS_DB_NAME` | - | Database name | +| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | ## Verification From d09d550909eabb375c3d705fcb70480e6a8c98f5 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:57:13 -0600 Subject: [PATCH 12/14] Add MD060 rule to markdownlint configuration --- .github/.markdownlint.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/.markdownlint.json b/.github/.markdownlint.json index e9300d1..f4ae7c9 100644 --- a/.github/.markdownlint.json +++ b/.github/.markdownlint.json @@ -8,5 +8,8 @@ "MD033": false, "MD048": false, "MD040": false, - "MD041": false + "MD041": false, + "MD060": { + "style": "any" + } } From 
680b1d15aa063a3db164e645efb567d5f8e65121 Mon Sep 17 00:00:00 2001 From: Timna Brown <24630902+brown9804@users.noreply.github.com> Date: Sun, 23 Nov 2025 21:58:28 -0600 Subject: [PATCH 13/14] Disable MD060 rule in markdownlint configuration --- .github/.markdownlint.json | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/.markdownlint.json b/.github/.markdownlint.json index f4ae7c9..0292480 100644 --- a/.github/.markdownlint.json +++ b/.github/.markdownlint.json @@ -9,7 +9,5 @@ "MD048": false, "MD040": false, "MD041": false, - "MD060": { - "style": "any" - } + "MD060": false } From 07cc7f97cb34c34869ee567f6051bad6c45de496 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 24 Nov 2025 03:58:43 +0000 Subject: [PATCH 14/14] Fix Markdown syntax issues --- TROUBLESHOOTING.md | 38 +++++++++++++++++++++++++++++++++----- src/DATA_PIPELINE.md | 17 +++++++++++------ 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md index 4042589..2b69924 100644 --- a/TROUBLESHOOTING.md +++ b/TROUBLESHOOTING.md @@ -43,39 +43,45 @@ ERROR: Python is not installed or not in PATH ``` **Solution**: -- Install Python 3.8+ from https://www.python.org/downloads/ + +- Install Python 3.8+ from <https://www.python.org/downloads/> - Ensure Python is added to your system PATH during installation - Verify installation: `python --version` ## Virtual Environment Creation Failed + ``` ERROR: Failed to create virtual environment ``` **Solution**: + - Ensure you have write permissions to the `src` directory - Try deleting existing `venv` folder: `Remove-Item -Recurse -Force venv` - Check if `python -m venv` works manually: `python -m venv test_venv` - On Windows, ensure your execution policy allows script execution ## Package Installation Failed + ``` ERROR: Could not install packages due to an OSError ``` **Solution**: + - Update pip: `python -m pip install --upgrade pip` - Clear pip cache: `pip cache purge` - Try installing with `--no-cache-dir`: `pip
install --no-cache-dir -r requirements.txt` - For Windows + pandas issues, use pre-built wheels by ensuring `pandas>=2.2.2` in requirements.txt - ## Not Logged into Azure CLI + ``` ERROR: Please run 'az login' to setup account ``` **Solution**: + ```powershell # Login to Azure CLI az login @@ -88,18 +94,20 @@ az account set --subscription ``` ## AAD Authentication Failed + ``` DefaultAzureCredential failed to retrieve a token ``` **Solution**: + 1. Ensure you're logged into Azure CLI: `az login` 2. Check your account has proper permissions 3. Verify the resource exists and you have access 4. Try clearing Azure credentials cache: `az account clear` then `az login` again - ## Local Authorization Disabled Error + ``` ERROR: Local Authorization is disabled. Use an AAD token to authorize all requests. ``` @@ -128,6 +136,7 @@ az account set --subscription > If your Cosmos DB has public network access disabled, your local machine or Codespace VM cannot connect. **Solution via Azure Portal**: + - Navigate to your Cosmos DB account in the Azure portal - Select **Networking** from the Settings menu - Ensure **Public network access** is set to **All networks** @@ -136,6 +145,7 @@ az account set --subscription - Try running the script again **Solution via Azure CLI**: + ```powershell az cosmosdb update \ --name \ @@ -146,10 +156,12 @@ az cosmosdb update \ - Insufficient Permissions: Your Azure account needs appropriate role assignments on the Cosmos DB account. 
**Required roles**: + - `Cosmos DB Built-in Data Contributor` (for read/write access) - Or `Contributor` at the resource group level **Solution via Azure CLI**: + ```powershell # Get your user object ID $userId = (az ad signed-in-user show --query id -o tsv) @@ -164,18 +176,20 @@ az cosmosdb sql role assignment create \ ``` ## Connection Timeout + ``` ERROR: Request timeout ``` **Solution**: + - Check your network connection - Verify Cosmos DB firewall settings allow your IP address - Ensure public network access is enabled (see above) -- Check if Azure services are experiencing outages: https://status.azure.com/ - +- Check if Azure services are experiencing outages: <https://status.azure.com/> ## CSV File Not Found + ``` WARNING: CSV data file not found at data/updated_product_catalog(in).csv ``` @@ -188,22 +202,26 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c ``` ## CSV Parsing Error + ``` ERROR: Error tokenizing data. C error: Expected X fields, saw Y ``` **Solution**: + - Ensure CSV fields with commas are properly quoted - Check for special characters or encoding issues - Verify the CSV has the correct number of columns (6): ProductID, ProductName, ProductCategory, ProductDescription, Price, ImageUrl - Try opening the CSV in a text editor to check for formatting issues ## Environment File Missing + ``` ERROR: .env file not found ``` **Solution**: + ```bash # Run Terraform to generate the .env file cd terraform-infrastructure @@ -211,22 +229,26 @@ terraform apply -auto-approve ``` ## Failed to Authenticate to Cosmos DB + ``` ERROR: Failed to authenticate to Cosmos DB using DefaultAzureCredential and no valid COSMOS_DB_KEY was provided ``` **Solution**: + - Ensure your `.env` file is properly generated with correct keys - Run `terraform apply` again if needed - Check that `COSMOS_DB_ENDPOINT` and `COSMOS_DB_KEY` are set correctly in `.env` - The script will automatically try AAD authentication first, then fall back to key-based auth ## Resource Already
Exists + ``` ERROR: A resource with the ID already exists ``` **Solution**: + - Import the existing resource: `terraform import . ` - Or destroy and recreate: `terraform destroy` then `terraform apply` - Check for resources in other resource groups with the same name @@ -238,6 +260,7 @@ ERROR: The client does not have authorization to perform action ``` **Solution**: + - Ensure your Azure account has `Contributor` or `Owner` role on the subscription or resource group - Check if specific Azure policies are blocking resource creation - Contact your Azure administrator to grant necessary permissions @@ -249,6 +272,7 @@ ERROR: Error configuring the backend "azurerm" ``` **Solution**: + - Verify your Azure credentials are configured: `az login` - Check that the specified subscription exists and you have access - Ensure the backend storage account and container exist (if using remote state) @@ -260,6 +284,7 @@ ERROR: Error acquiring the state lock ``` **Solution**: + ```bash # Force unlock (use with caution) terraform force-unlock @@ -272,18 +297,21 @@ terraform force-unlock For more detailed error information: **Azure CLI**: + ```powershell az --debug ``` **Python Scripts**: Set environment variable before running: + ```powershell $env:AZURE_LOG_LEVEL = "DEBUG" python pipelines/script.py ``` **Terraform**: + ```bash export TF_LOG=DEBUG terraform apply diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md index 0cf0576..69a2ff8 100644 --- a/src/DATA_PIPELINE.md +++ b/src/DATA_PIPELINE.md @@ -11,7 +11,6 @@ Last updated: 2025-11-24 > This automation handles the complete data pipeline setup for the Azure AI Shopping application. -
Table of Content (Click to expand) @@ -29,15 +28,14 @@ Last updated: 2025-11-24
- > [!NOTE] > What It Does? The data pipeline automation performs the following tasks: +> > 1. **Creates Python Virtual Environment**: Sets up an isolated Python environment with all required dependencies > 2. **Imports Data to Cosmos DB**: Loads product catalog data from CSV into Cosmos DB container > 3. **Creates Azure AI Search Index**: Sets up a search index with vector search capabilities > 4. **Imports Data to Search**: Populates the search index from Cosmos DB using an indexer -
Prerequisites: (Click to expand) @@ -68,6 +66,7 @@ terraform apply -auto-approve ``` This will: + - Deploy all Azure resources - Create AI model deployments - Generate `.env` file @@ -76,17 +75,20 @@ This will: > Option 2: Run Manually → If you prefer to run the data pipeline manually or separately: 1. **Ensure `.env` file exists** (created by Terraform): + ```bash cd terraform-infrastructure terraform apply -auto-approve ``` 2. **Navigate to src directory**: + ```bash cd ../src ``` 3. **Create virtual environment and install dependencies**: + ```powershell python -m venv venv .\venv\Scripts\Activate.ps1 @@ -95,6 +97,7 @@ This will: ``` 4. **Run pipeline scripts in order**: + ```powershell # Step 1: Import data to Cosmos DB python pipelines/ingest_to_cosmos.py @@ -115,6 +118,7 @@ src/data/updated_product_catalog(in).csv ``` > Expected columns: + - `ProductID`: Unique product identifier - `ProductName`: Product name - `ProductCategory`: Product category @@ -147,7 +151,6 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c
-
pipelines/create_search_index.py (Click to expand) @@ -158,7 +161,6 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c
-
pipelines/create_search_index.py (Click to expand) @@ -196,7 +198,7 @@ curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.c > For detailed troubleshooting guidance, see [TROUBLESHOOTING.md](../TROUBLESHOOTING.md). Quick Reference: -- **Python Not Found**: Install Python 3.8+ from https://www.python.org/downloads/ +- **Python Not Found**: Install Python 3.8+ from <https://www.python.org/downloads/> - **CSV File Not Found**: Download the product catalog CSV file and place it in `src/data/` directory - **Authentication Errors**: Run `az login` and ensure you have proper permissions. See [TROUBLESHOOTING.md](../TROUBLESHOOTING.md#azure-authentication-issues) for detailed solutions. - **Virtual Environment Issues**: Delete `venv` folder and recreate. See [TROUBLESHOOTING.md](../TROUBLESHOOTING.md#python-environment-issues) for details. @@ -236,6 +238,7 @@ AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small > After running the pipeline, verify data was imported: ## Check Cosmos DB + ```powershell az cosmosdb sql container show \ --account-name \ @@ -245,6 +248,7 @@ az cosmosdb sql container show \ ``` ## Check Search Index + ```powershell az search index show \ --index-name products-index \ @@ -253,6 +257,7 @@ az search index show \ ``` ## Query Search Index + ```powershell az search index show-statistics \ --index-name products-index \