diff --git a/.github/.markdownlint.json b/.github/.markdownlint.json index e9300d1..0292480 100644 --- a/.github/.markdownlint.json +++ b/.github/.markdownlint.json @@ -8,5 +8,6 @@ "MD033": false, "MD048": false, "MD040": false, - "MD041": false + "MD041": false, + "MD060": false } diff --git a/.gitignore b/.gitignore index 6349e36..c95816f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ # Local .terraform directories .terraform/ +*.terraform.lock.hcl +.terraform.lock.hcl +*src/.env # .tfstate files *.tfstate diff --git a/TROUBLESHOOTING.md b/TROUBLESHOOTING.md new file mode 100644 index 0000000..2b69924 --- /dev/null +++ b/TROUBLESHOOTING.md @@ -0,0 +1,355 @@ +# Troubleshooting Guide - Overview + +Costa Rica + +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-11-24 + +---------- + +> This guide covers common issues you may encounter when deploying and running this Azure AI Shopping demo application. + +
Table of Contents (Click to expand) 

- [Python Not Found](#python-not-found)
- [Virtual Environment Creation Failed](#virtual-environment-creation-failed)
- [Package Installation Failed](#package-installation-failed)
- [Not Logged into Azure CLI](#not-logged-into-azure-cli)
- [AAD Authentication Failed](#aad-authentication-failed)
- [Local Authorization Disabled Error](#local-authorization-disabled-error)
- [Connection Timeout](#connection-timeout)
- [CSV File Not Found](#csv-file-not-found)
- [CSV Parsing Error](#csv-parsing-error)
- [Environment File Missing](#environment-file-missing)
- [Failed to Authenticate to Cosmos DB](#failed-to-authenticate-to-cosmos-db)
- [Resource Already Exists](#resource-already-exists)
- [Insufficient Permissions](#insufficient-permissions)
- [Provider Configuration Error](#provider-configuration-error)
- [State Lock Error](#state-lock-error)
- [Enable Verbose Logging](#enable-verbose-logging)
- [Check Azure Service Health](#check-azure-service-health)
- [Clean Up and Retry](#clean-up-and-retry)
- [Still Having Issues?](#still-having-issues)

+ +## Python Not Found + +``` +ERROR: Python is not installed or not in PATH +``` + +**Solution**: + +- Install Python 3.8+ from +- Ensure Python is added to your system PATH during installation +- Verify installation: `python --version` + +## Virtual Environment Creation Failed + +``` +ERROR: Failed to create virtual environment +``` + +**Solution**: + +- Ensure you have write permissions to the `src` directory +- Try deleting existing `venv` folder: `Remove-Item -Recurse -Force venv` +- Check if `python -m venv` works manually: `python -m venv test_venv` +- On Windows, ensure your execution policy allows script execution + +## Package Installation Failed + +``` +ERROR: Could not install packages due to an OSError +``` + +**Solution**: + +- Update pip: `python -m pip install --upgrade pip` +- Clear pip cache: `pip cache purge` +- Try installing with `--no-cache-dir`: `pip install --no-cache-dir -r requirements.txt` +- For Windows + pandas issues, use pre-built wheels by ensuring `pandas>=2.2.2` in requirements.txt + +## Not Logged into Azure CLI + +``` +ERROR: Please run 'az login' to setup account +``` + +**Solution**: + +```powershell +# Login to Azure CLI +az login + +# Verify you're logged in with the correct account +az account show + +# If needed, set the correct subscription +az account set --subscription +``` + +## AAD Authentication Failed + +``` +DefaultAzureCredential failed to retrieve a token +``` + +**Solution**: + +1. Ensure you're logged into Azure CLI: `az login` +2. Check your account has proper permissions +3. Verify the resource exists and you have access +4. Try clearing Azure credentials cache: `az account clear` then `az login` again + +## Local Authorization Disabled Error + +``` +ERROR: Local Authorization is disabled. Use an AAD token to authorize all requests. +``` + +> This error occurs when Cosmos DB requires Azure Active Directory (AAD) authentication instead of key-based authentication. 
+ +**Common Causes and Solutions**: + +- Not logged into Azure CLI + +```powershell +# Login to Azure CLI +az login + +# Verify you're logged in with the correct account +az account show + +# If needed, set the correct subscription +az account set --subscription +``` + +> After logging in, try running the script again. + +- Public Network Access Disabled + +> If your Cosmos DB has public network access disabled, your local machine or Codespace VM cannot connect. + +**Solution via Azure Portal**: + +- Navigate to your Cosmos DB account in the Azure portal +- Select **Networking** from the Settings menu +- Ensure **Public network access** is set to **All networks** +- Click **Save** +- Wait a few minutes for the change to propagate +- Try running the script again + +**Solution via Azure CLI**: + +```powershell +az cosmosdb update \ + --name \ + --resource-group \ + --enable-public-network true +``` + +- Insufficient Permissions: Your Azure account needs appropriate role assignments on the Cosmos DB account. 
+ +**Required roles**: + +- `Cosmos DB Built-in Data Contributor` (for read/write access) +- Or `Contributor` at the resource group level + +**Solution via Azure CLI**: + +```powershell +# Get your user object ID +$userId = (az ad signed-in-user show --query id -o tsv) + +# Assign Cosmos DB Data Contributor role +az cosmosdb sql role assignment create \ + --account-name \ + --resource-group \ + --role-definition-id 00000000-0000-0000-0000-000000000002 \ + --principal-id $userId \ + --scope "/" +``` + +## Connection Timeout + +``` +ERROR: Request timeout +``` + +**Solution**: + +- Check your network connection +- Verify Cosmos DB firewall settings allow your IP address +- Ensure public network access is enabled (see above) +- Check if Azure services are experiencing outages: + +## CSV File Not Found + +``` +WARNING: CSV data file not found at data/updated_product_catalog(in).csv +``` + +**Solution**: +Download or place the product catalog CSV file in the `src/data/` directory: + +```bash +curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/main/src/data/updated_product_catalog(in).csv +``` + +## CSV Parsing Error + +``` +ERROR: Error tokenizing data. 
C error: Expected X fields, saw Y +``` + +**Solution**: + +- Ensure CSV fields with commas are properly quoted +- Check for special characters or encoding issues +- Verify the CSV has the correct number of columns (6): ProductID, ProductName, ProductCategory, ProductDescription, Price, ImageUrl +- Try opening the CSV in a text editor to check for formatting issues + +## Environment File Missing + +``` +ERROR: .env file not found +``` + +**Solution**: + +```bash +# Run Terraform to generate the .env file +cd terraform-infrastructure +terraform apply -auto-approve +``` + +## Failed to Authenticate to Cosmos DB + +``` +ERROR: Failed to authenticate to Cosmos DB using DefaultAzureCredential and no valid COSMOS_DB_KEY was provided +``` + +**Solution**: + +- Ensure your `.env` file is properly generated with correct keys +- Run `terraform apply` again if needed +- Check that `COSMOS_DB_ENDPOINT` and `COSMOS_DB_KEY` are set correctly in `.env` +- The script will automatically try AAD authentication first, then fall back to key-based auth + +## Resource Already Exists + +``` +ERROR: A resource with the ID already exists +``` + +**Solution**: + +- Import the existing resource: `terraform import . 
` +- Or destroy and recreate: `terraform destroy` then `terraform apply` +- Check for resources in other resource groups with the same name + +## Insufficient Permissions + +``` +ERROR: The client does not have authorization to perform action +``` + +**Solution**: + +- Ensure your Azure account has `Contributor` or `Owner` role on the subscription or resource group +- Check if specific Azure policies are blocking resource creation +- Contact your Azure administrator to grant necessary permissions + +## Provider Configuration Error + +``` +ERROR: Error configuring the backend "azurerm" +``` + +**Solution**: + +- Verify your Azure credentials are configured: `az login` +- Check that the specified subscription exists and you have access +- Ensure the backend storage account and container exist (if using remote state) + +## State Lock Error + +``` +ERROR: Error acquiring the state lock +``` + +**Solution**: + +```bash +# Force unlock (use with caution) +terraform force-unlock +``` + +> Only force-unlock if you're certain no other Terraform process is running. + +## Enable Verbose Logging + +For more detailed error information: + +**Azure CLI**: + +```powershell +az --debug +``` + +**Python Scripts**: +Set environment variable before running: + +```powershell +$env:AZURE_LOG_LEVEL = "DEBUG" +python pipelines/script.py +``` + +**Terraform**: + +```bash +export TF_LOG=DEBUG +terraform apply +``` + +## Check Azure Service Health + +> If experiencing unexpected issues, check [Azure service status](https://status.azure.com/) + +## Clean Up and Retry + +> Sometimes a clean slate helps: + +```bash +# Clean Python environment +Remove-Item -Recurse -Force venv +python -m venv venv + +# Clean Terraform state (use with caution) +terraform destroy +Remove-Item -Recurse -Force .terraform +terraform init +terraform apply +``` + +## Still Having Issues? + +> If you continue experiencing problems: + +1. 
Check the [GitHub repository issues](https://github.com/MicrosoftCloudEssentials-LearningHub/Agentic-DevOps-AI-Shopping/issues) +2. Review Azure documentation for specific services +3. Enable detailed logging as described above +4. Collect error messages, logs, and configuration details +5. Create a new issue with detailed information about your problem + + +
+ Total views +

Refresh Date: 2025-11-24

+
+ diff --git a/src/DATA_PIPELINE.md b/src/DATA_PIPELINE.md new file mode 100644 index 0000000..69a2ff8 --- /dev/null +++ b/src/DATA_PIPELINE.md @@ -0,0 +1,283 @@ +# Data Pipeline Automation - Overview + +Costa Rica + +[![GitHub](https://img.shields.io/badge/--181717?logo=github&logoColor=ffffff)](https://github.com/) +[brown9804](https://github.com/brown9804) + +Last updated: 2025-11-24 + +---------- + +> This automation handles the complete data pipeline setup for the Azure AI Shopping application. + +
Table of Contents (Click to expand) 

- [Usage](#usage)
- [Data Files](#data-files)
- [Scripts](#scripts)
- [Troubleshooting](#troubleshooting)
- [Configuration](#configuration)
- [Environment Variable Reference](#environment-variable-reference)
- [Verification](#verification)
- [Check Cosmos DB](#check-cosmos-db)
- [Check Search Index](#check-search-index)
- [Query Search Index](#query-search-index)
- [Next Steps](#next-steps)

+ +> [!NOTE] +> What It Does? The data pipeline automation performs the following tasks: +> +> 1. **Creates Python Virtual Environment**: Sets up an isolated Python environment with all required dependencies +> 2. **Imports Data to Cosmos DB**: Loads product catalog data from CSV into Cosmos DB container +> 3. **Creates Azure AI Search Index**: Sets up a search index with vector search capabilities +> 4. **Imports Data to Search**: Populates the search index from Cosmos DB using an indexer + +
+ Prerequisites: (Click to expand) + +> - Python 3.8 or higher installed and available in PATH +> - Product catalog CSV file at `src/data/updated_product_catalog(in).csv` (demo) + +
+ +> Automated by Terraform: + +- Cosmos DB account and database +- Azure AI Search service +- Azure OpenAI model deployments +- Environment variables in `src/.env` + +## Usage + +> Option 1: Run Automatically with Terraform → Enable data pipeline automation in `terraform.tfvars`: + +```hcl +enable_data_pipeline = true +``` + +Then run: + +```bash +terraform apply -auto-approve +``` + +This will: + +- Deploy all Azure resources +- Create AI model deployments +- Generate `.env` file +- **Automatically run the complete data pipeline** + +> Option 2: Run Manually → If you prefer to run the data pipeline manually or separately: + +1. **Ensure `.env` file exists** (created by Terraform): + + ```bash + cd terraform-infrastructure + terraform apply -auto-approve + ``` + +2. **Navigate to src directory**: + + ```bash + cd ../src + ``` + +3. **Create virtual environment and install dependencies**: + + ```powershell + python -m venv venv + .\venv\Scripts\Activate.ps1 + pip install --upgrade pip + pip install -r requirements.txt + ``` + +4. 
**Run pipeline scripts in order**:

   ```powershell
   # Step 1: Import data to Cosmos DB
   python pipelines/ingest_to_cosmos.py

   # Step 2: Create Azure AI Search index
   python pipelines/create_search_index.py

   # Step 3: Upload data to search index
   python pipelines/upload_to_search.py
   ```

## Data Files

> Product Catalog CSV → The product catalog data should be placed at:

```
src/data/updated_product_catalog(in).csv
```

> Expected columns:

- `ProductID`: Unique product identifier
- `ProductName`: Product name
- `ProductCategory`: Product category
- `ProductDescription`: Product description
- `Price`: Product price
- `ImageUrl`: URL to product image

> Download Data → If you don't have the data file, you can download it from the reference repository [TechWorkshop-L300-AI-Apps-and-agents](https://github.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/tree/main), please feel free to follow the guide as well [Guide - TechWorkshop L300: AI Apps and Agents](https://microsoft.github.io/TechWorkshop-L300-AI-Apps-and-agents/):

```bash
# Download the product catalog data
curl -o src/data/updated_product_catalog(in).csv https://raw.githubusercontent.com/microsoft/TechWorkshop-L300-AI-Apps-and-agents/main/src/data/updated_product_catalog(in).csv
```

## Scripts

+ pipelines/ingest_to_cosmos.py (Click to expand) + +- Reads CSV data with product catalog +- Connects to Cosmos DB (uses AAD or key-based auth) +- Creates database and container if they don't exist +- Imports all products with upsert operations +- Creates `content_for_vector` field for semantic search +- **Smart Skip Logic**: + - By default (`COSMOS_SKIP_IF_EXISTS=true`), checks if container already has data + - If data exists, skips import to avoid duplicates and save time + - Set `COSMOS_FORCE_INGEST=true` to force re-import even if data exists + - Set `COSMOS_SKIP_IF_EXISTS=false` to always import (legacy behavior) + +
+ +
 pipelines/create_search_index.py (Click to expand) 

- Creates Azure AI Search index with vector search capabilities
- Configures HNSW algorithm for efficient vector similarity search
- Sets up Azure OpenAI vectorizer with text-embedding-3-small model
- Defines searchable, filterable, and vector fields
- Supports hybrid search (keyword + semantic)

+ pipelines/upload_to_search.py (Click to expand) + +- Reads all documents from Cosmos DB container +- Authenticates using AAD or key-based auth (auto-fallback) +- Maps Cosmos DB fields to Azure AI Search index schema +- Uploads documents in batches to Azure AI Search +- Provides detailed success/failure reporting +- **Note**: This script replaces the traditional indexer approach to avoid managed identity complexity when Cosmos DB local auth is disabled + +
+ +## Troubleshooting + +> For detailed troubleshooting guidance, see [TROUBLESHOOTING.md](../TROUBLESHOOTING.md). Quick Reference: + +- **Python Not Found**: Install Python 3.8+ from +- **CSV File Not Found**: Download the product catalog CSV file and place it in `src/data/` directory +- **Authentication Errors**: Run `az login` and ensure you have proper permissions. See [TROUBLESHOOTING.md](../TROUBLESHOOTING.md#azure-authentication-issues) for detailed solutions. +- **Virtual Environment Issues**: Delete `venv` folder and recreate. See [TROUBLESHOOTING.md](../TROUBLESHOOTING.md#python-environment-issues) for details. + +## Configuration + +> All configuration is pulled from the `.env` file created by Terraform: + +```bash +COSMOS_DB_ENDPOINT=... +COSMOS_DB_KEY=... +COSMOS_DB_NAME=... +COSMOS_DB_CONTAINER_NAME=products +COSMOS_SKIP_IF_EXISTS=true # Skip import if data already exists +COSMOS_FORCE_INGEST=false # Force re-import even if data exists +SEARCH_SERVICE_ENDPOINT=... +SEARCH_SERVICE_KEY=... +SEARCH_INDEX_NAME=products-index +AZURE_OPENAI_ENDPOINT=... +AZURE_OPENAI_API_KEY=... 
+AZURE_OPENAI_EMBEDDING_DEPLOYMENT=text-embedding-3-small +``` + +## Environment Variable Reference + +| Variable | Default | Description | +|----------------------------|---------|-------------------------------------------------------| +| `COSMOS_SKIP_IF_EXISTS` | `true` | Skip import if container already has data | +| `COSMOS_FORCE_INGEST` | `false` | Force re-import even if data exists (overrides skip) | +| `COSMOS_DB_ENDPOINT` | - | Cosmos DB account endpoint URL | +| `COSMOS_DB_KEY` | - | Cosmos DB account key (optional if using AAD) | +| `COSMOS_DB_NAME` | - | Database name | +| `COSMOS_DB_CONTAINER_NAME` | - | Container name for product catalog | + +## Verification + +> After running the pipeline, verify data was imported: + +## Check Cosmos DB + +```powershell +az cosmosdb sql container show \ + --account-name \ + --database-name zava \ + --name products \ + --resource-group +``` + +## Check Search Index + +```powershell +az search index show \ + --index-name products-index \ + --service-name \ + --resource-group +``` + +## Query Search Index + +```powershell +az search index show-statistics \ + --index-name products-index \ + --service-name \ + --resource-group +``` + +## Next Steps + +> After the data pipeline completes: + +1. Your Cosmos DB container is populated with product data +2. Azure AI Search index is created with vector search enabled +3. Search index is populated from Cosmos DB +4. You can now build AI agents that query this data +5. Use the search index for hybrid search (keyword + semantic) + + +
+ Total views +

Refresh Date: 2025-11-24

+
+ diff --git a/src/data/updated_product_catalog(in).csv b/src/data/updated_product_catalog(in).csv new file mode 100644 index 0000000..426e823 --- /dev/null +++ b/src/data/updated_product_catalog(in).csv @@ -0,0 +1,21 @@ +ProductID,ProductName,ProductCategory,ProductDescription,Price,ImageUrl +1001,Zava Smart Speaker,Electronics,Voice-controlled speaker with high-fidelity audio and edge AI noise suppression,89.99,https://example.com/images/speaker.jpg +1002,Zava Wireless Earbuds,Electronics,Comfort-fit earbuds with adaptive EQ and 30h battery life,129.00,https://example.com/images/earbuds.jpg +1003,Zava Fitness Tracker,Sports,Water-resistant tracker with heart rate variability and sleep stage insights,59.50,https://example.com/images/tracker.jpg +1004,Zava Running Shoes,Sports,Breathable mesh performance shoes with responsive foam sole,104.95,https://example.com/images/runningshoes.jpg +1005,Zava Cotton Hoodie,Apparel,Ultra-soft recycled cotton hoodie with antimicrobial treatment,54.99,https://example.com/images/hoodie.jpg +1006,Zava Insulated Bottle,Home,Double-wall stainless steel bottle keeps drinks cold 24h / hot 12h,28.00,https://example.com/images/bottle.jpg +1007,Zava Ceramic Mug,Home,Matte glaze 14oz mug safe for dishwasher and microwave,12.75,https://example.com/images/mug.jpg +1008,Zava Multi-Tool Outdoor,Sports,Compact 11-in-1 stainless multi-tool with locking blades,36.40,https://example.com/images/multitool.jpg +1009,Zava Hair Serum,Beauty,Nutrient-rich lightweight serum for frizz control and shine,24.50,https://example.com/images/hairserum.jpg +1010,Zava Vitamin C Gummies,Grocery,Non-GMO vegan gummies with natural citrus flavor (90 count),18.99,https://example.com/images/vitaminc.jpg +1011,Zava Gaming Mouse,Electronics,Customizable RGB ergonomic mouse with 12K DPI sensor,64.00,https://example.com/images/mouse.jpg +1012,Zava Mechanical Keyboard,Electronics,"Hot-swap switches, per-key lighting, and PBT 
keycaps",139.95,https://example.com/images/keyboard.jpg +1013,Zava Desk Lamp,Home,Adjustable LED lamp with ambient backlight and USB-C charging port,42.25,https://example.com/images/desklamp.jpg +1014,Zava Noise Masking Device,Electronics,Generates adaptive ambient sound for focus and sleep environments,79.99,https://example.com/images/noisemask.jpg +1015,Zava Travel Backpack,Apparel,Weather-resistant 28L backpack with laptop sleeve and hidden pocket,98.00,https://example.com/images/backpack.jpg +1016,Zava Smart Plug,Electronics,Energy monitoring smart plug with over-current protection,19.95,https://example.com/images/smartplug.jpg +1017,Zava LED Strip Kit,Electronics,16M color Wi-Fi LED strip with music sync mode,34.50,https://example.com/images/ledstrip.jpg +1018,Zava Foam Roller,Sports,"High-density recovery roller improves circulation and muscle release",25.00,https://example.com/images/foamroller.jpg +1019,Zava Sunscreen SPF50,Beauty,Broad-spectrum mineral sunscreen water-resistant for 80 minutes,21.25,https://example.com/images/sunscreen.jpg +1020,Zava Organic Trail Mix,Grocery,"Blend of roasted nuts, seeds, and dried berries (16oz)",11.49,https://example.com/images/trailmix.jpg \ No newline at end of file diff --git a/src/pipelines/create_search_index.py b/src/pipelines/create_search_index.py new file mode 100644 index 0000000..d9e9333 --- /dev/null +++ b/src/pipelines/create_search_index.py @@ -0,0 +1,119 @@ +import logging +import os +from azure.search.documents.indexes import SearchIndexClient +from azure.search.documents.indexes.models import ( + SearchIndex, + SearchField, + SearchFieldDataType, + SimpleField, + SearchableField, + VectorSearch, + HnswAlgorithmConfiguration, + VectorSearchProfile, + AzureOpenAIVectorizer, + AzureOpenAIVectorizerParameters +) +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + +# Configuration +SEARCH_ENDPOINT = 
os.environ.get("SEARCH_SERVICE_ENDPOINT") +SEARCH_KEY = os.environ.get("SEARCH_SERVICE_KEY") +INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "products-index") +AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT") +AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY") +EMBEDDING_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small") + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def create_search_index(): + """Create Azure AI Search index with vector search capabilities.""" + + if not SEARCH_ENDPOINT: + raise ValueError("SEARCH_SERVICE_ENDPOINT must be provided in environment variables") + + # Create client + try: + logger.info("Attempting to create Search Index Client...") + if SEARCH_KEY: + credential = AzureKeyCredential(SEARCH_KEY) + else: + credential = DefaultAzureCredential() + + index_client = SearchIndexClient(endpoint=SEARCH_ENDPOINT, credential=credential) + logger.info("Search Index Client created successfully") + except Exception as e: + logger.error(f"Failed to create Search Index Client: {e}") + raise + + # Define the index fields + fields = [ + SimpleField(name="id", type=SearchFieldDataType.String, key=True, filterable=True), + SimpleField(name="ProductID", type=SearchFieldDataType.String, filterable=True), + SearchableField(name="ProductName", type=SearchFieldDataType.String, searchable=True), + SearchableField(name="ProductCategory", type=SearchFieldDataType.String, searchable=True, filterable=True, facetable=True), + SearchableField(name="ProductDescription", type=SearchFieldDataType.String, searchable=True), + SimpleField(name="ProductPrice", type=SearchFieldDataType.Double, filterable=True, sortable=True), + SimpleField(name="ProductImageURL", type=SearchFieldDataType.String), + SearchableField(name="content_for_vector", type=SearchFieldDataType.String, searchable=True), + SearchField( + name="content_vector", + 
type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + searchable=True, + vector_search_dimensions=1536, # text-embedding-3-small dimensions + vector_search_profile_name="vector-profile" + ) + ] + + # Configure vector search + vector_search = VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration(name="hnsw-algorithm") + ], + profiles=[ + VectorSearchProfile( + name="vector-profile", + algorithm_configuration_name="hnsw-algorithm", + vectorizer_name="openai-vectorizer" + ) + ], + vectorizers=[ + AzureOpenAIVectorizer( + vectorizer_name="openai-vectorizer", + parameters=AzureOpenAIVectorizerParameters( + resource_url=AZURE_OPENAI_ENDPOINT, + deployment_name=EMBEDDING_DEPLOYMENT, + model_name="text-embedding-3-small", # Required in API version 2025-09-01 + api_key=AZURE_OPENAI_API_KEY + ) + ) + ] + ) + + # Create the search index + index = SearchIndex( + name=INDEX_NAME, + fields=fields, + vector_search=vector_search + ) + + try: + logger.info(f"Creating search index: {INDEX_NAME}...") + result = index_client.create_or_update_index(index) + logger.info(f"Search index '{result.name}' created successfully") + return result + except Exception as e: + logger.error(f"Failed to create search index: {e}") + raise + +def main(): + create_search_index() + logger.info("Search index creation completed successfully") + +if __name__ == "__main__": + main() diff --git a/src/pipelines/import_to_search.py b/src/pipelines/import_to_search.py new file mode 100644 index 0000000..873b6ac --- /dev/null +++ b/src/pipelines/import_to_search.py @@ -0,0 +1,149 @@ +import logging +import os +from azure.cosmos import CosmosClient +from azure.search.documents import SearchClient +from azure.search.documents.indexes import SearchIndexerClient +from azure.search.documents.indexes.models import ( + SearchIndexerDataSourceConnection, + SearchIndexerDataContainer, + SearchIndexer, + FieldMapping, + IndexingSchedule +) +from azure.core.credentials import AzureKeyCredential +from 
azure.identity import DefaultAzureCredential +from dotenv import load_dotenv +import time + +load_dotenv() + +# Configuration +COSMOS_ENDPOINT = os.environ.get("COSMOS_DB_ENDPOINT") +COSMOS_KEY = os.environ.get("COSMOS_DB_KEY") +DATABASE_NAME = os.environ.get("COSMOS_DB_NAME") +CONTAINER_NAME = os.environ.get("COSMOS_DB_CONTAINER_NAME") +SEARCH_ENDPOINT = os.environ.get("SEARCH_SERVICE_ENDPOINT") +SEARCH_KEY = os.environ.get("SEARCH_SERVICE_KEY") +INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "products-index") +DATASOURCE_NAME = f"{INDEX_NAME}-datasource" +INDEXER_NAME = f"{INDEX_NAME}-indexer" + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def create_cosmos_datasource(): + """Create a data source connection to Cosmos DB.""" + + if not SEARCH_KEY: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(SEARCH_KEY) + + indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential) + + # Create the data source connection + container = SearchIndexerDataContainer(name=CONTAINER_NAME) + + data_source_connection = SearchIndexerDataSourceConnection( + name=DATASOURCE_NAME, + type="cosmosdb", + connection_string=f"AccountEndpoint={COSMOS_ENDPOINT};AccountKey={COSMOS_KEY};Database={DATABASE_NAME}", + container=container + ) + + try: + logger.info(f"Creating data source: {DATASOURCE_NAME}...") + result = indexer_client.create_or_update_data_source_connection(data_source_connection) + logger.info(f"Data source '{result.name}' created successfully") + return result + except Exception as e: + logger.error(f"Failed to create data source: {e}") + raise + +def create_indexer(): + """Create an indexer to import data from Cosmos DB to Azure AI Search.""" + + if not SEARCH_KEY: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(SEARCH_KEY) + + indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential) + + # Create the 
indexer + indexer = SearchIndexer( + name=INDEXER_NAME, + data_source_name=DATASOURCE_NAME, + target_index_name=INDEX_NAME, + field_mappings=[ + FieldMapping(source_field_name="id", target_field_name="id"), + FieldMapping(source_field_name="ProductID", target_field_name="ProductID"), + FieldMapping(source_field_name="ProductName", target_field_name="ProductName"), + FieldMapping(source_field_name="ProductCategory", target_field_name="ProductCategory"), + FieldMapping(source_field_name="ProductDescription", target_field_name="ProductDescription"), + FieldMapping(source_field_name="ProductPrice", target_field_name="ProductPrice"), + FieldMapping(source_field_name="ProductImageURL", target_field_name="ProductImageURL"), + FieldMapping(source_field_name="content_for_vector", target_field_name="content_for_vector"), + ] + ) + + try: + logger.info(f"Creating indexer: {INDEXER_NAME}...") + result = indexer_client.create_or_update_indexer(indexer) + logger.info(f"Indexer '{result.name}' created successfully") + return result + except Exception as e: + logger.error(f"Failed to create indexer: {e}") + raise + +def run_indexer(): + """Run the indexer to start data import.""" + + if not SEARCH_KEY: + credential = DefaultAzureCredential() + else: + credential = AzureKeyCredential(SEARCH_KEY) + + indexer_client = SearchIndexerClient(endpoint=SEARCH_ENDPOINT, credential=credential) + + try: + logger.info(f"Running indexer: {INDEXER_NAME}...") + indexer_client.run_indexer(INDEXER_NAME) + logger.info("Indexer started successfully") + + # Wait for indexer to complete + logger.info("Waiting for indexer to complete...") + for i in range(30): # Wait up to 5 minutes + time.sleep(10) + status = indexer_client.get_indexer_status(INDEXER_NAME) + last_result = status.last_result + + if last_result: + logger.info(f"Indexer status: {last_result.status}") + if last_result.status == "success": + logger.info(f"Indexer completed successfully. 
Indexed {last_result.items_processed} items.") + return + elif last_result.status == "transientFailure" or last_result.status == "persistentFailure": + logger.error(f"Indexer failed: {last_result.error_message}") + raise Exception(f"Indexer failed: {last_result.error_message}") + + logger.warning("Indexer is still running after timeout") + except Exception as e: + logger.error(f"Failed to run indexer: {e}") + raise + +def main(): + # Step 1: Create Cosmos DB data source + create_cosmos_datasource() + + # Step 2: Create indexer + create_indexer() + + # Step 3: Run indexer + run_indexer() + + logger.info("Data import to Azure AI Search completed successfully") + +if __name__ == "__main__": + main() diff --git a/src/pipelines/ingest_to_cosmos.py b/src/pipelines/ingest_to_cosmos.py new file mode 100644 index 0000000..9c84fe3 --- /dev/null +++ b/src/pipelines/ingest_to_cosmos.py @@ -0,0 +1,135 @@ +import logging +import pandas as pd +import os +from azure.cosmos import CosmosClient, PartitionKey +from azure.identity import DefaultAzureCredential +from azure.core.exceptions import AzureError +from dotenv import load_dotenv + +load_dotenv() + +# CONFIGURATIONS - Replace with your actual values +COSMOS_ENDPOINT = os.environ.get("COSMOS_DB_ENDPOINT") +COSMOS_KEY = os.environ.get("COSMOS_DB_KEY") +DATABASE_NAME = os.environ.get("COSMOS_DB_NAME") +CONTAINER_NAME = os.environ.get("COSMOS_DB_CONTAINER_NAME") +SKIP_IF_EXISTS = os.environ.get("COSMOS_SKIP_IF_EXISTS", "true").lower() == "true" +FORCE_INGEST = os.environ.get("COSMOS_FORCE_INGEST", "false").lower() == "true" +CSV_FILE = r"data/updated_product_catalog(in).csv" + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def get_cosmos_client(endpoint: str | None, key: str | None = None): + """Try to authenticate to Cosmos DB using DefaultAzureCredential first. + + If that fails, fall back to using the provided key. + Returns a connected CosmosClient instance. 
+ """ + if not endpoint: + raise ValueError("COSMOS_DB_ENDPOINT must be provided in environment variables") + + # Try AAD first + try: + logger.info("Attempting to authenticate to Cosmos DB using DefaultAzureCredential (AAD)...") + credential = DefaultAzureCredential() + client = CosmosClient(endpoint, credential=credential) + + # Perform a light operation to validate the credential + _ = list(client.list_databases()) + logger.info("Authenticated to Cosmos DB with DefaultAzureCredential.") + return client + except AzureError as ex: + logger.warning("AAD authentication failed: %s", ex) + + # Fallback to key + if key: + try: + logger.info("Falling back to endpoint + key authentication for Cosmos DB...") + client = CosmosClient(endpoint, key) + # Validate key by a light operation + _ = list(client.list_databases()) + logger.info("Authenticated to Cosmos DB with endpoint+key.") + return client + except Exception as ex: + logger.error("Endpoint+key authentication failed: %s", ex) + raise + + # If we reach here, both auth methods failed or no key provided + raise RuntimeError("Failed to authenticate to Cosmos DB using DefaultAzureCredential and no valid COSMOS_DB_KEY was provided") + +def main(): + # 1. Read data from CSV + logger.info(f"Reading data from {CSV_FILE}...") + df = pd.read_csv(CSV_FILE, encoding='utf-8', quoting=1) # quoting=1 is csv.QUOTE_ALL + + # Create content for vector search + df['content_for_vector'] = ( + df['ProductName'].fillna('').astype(str) + ' | ' + + df['ProductCategory'].fillna('').astype(str) + ' | ' + + df['ProductDescription'].fillna('').astype(str) + ) + + logger.info(f"Loaded {len(df)} products from CSV") + + # 2. 
Connect to Cosmos DB + client = get_cosmos_client(COSMOS_ENDPOINT, COSMOS_KEY) + + if not DATABASE_NAME: + raise ValueError("COSMOS_DB_NAME must be provided in environment variables") + + if not CONTAINER_NAME: + raise ValueError("COSMOS_DB_CONTAINER_NAME must be provided in environment variables") + + database = client.create_database_if_not_exists(id=DATABASE_NAME) + logger.info(f"Connected to database: {DATABASE_NAME}") + + container = database.create_container_if_not_exists( + id=CONTAINER_NAME, + partition_key=PartitionKey(path="/ProductID") + ) + logger.info(f"Connected to container: {CONTAINER_NAME}") + + # Check existing item count (lightweight) + existing_count = 0 + try: + count_query = list(container.query_items( + query="SELECT VALUE COUNT(1) FROM c", + enable_cross_partition_query=True + )) + if count_query: + raw_val = count_query[0] + if isinstance(raw_val, dict): + for k in ("$1", "count", "COUNT"): + if k in raw_val: + raw_val = raw_val[k] + break + if isinstance(raw_val, (int, float, str)): + existing_count = int(raw_val) + except Exception as ex: + logger.warning(f"Count query failed (will ignore): {ex}") + + if existing_count > 0 and SKIP_IF_EXISTS and not FORCE_INGEST: + logger.info( + f"Container already has {existing_count} items. Skipping ingestion (SKIP_IF_EXISTS=true, FORCE_INGEST=false)." + ) + return + + # 3. 
Upload items + logger.info("Starting data upload to Cosmos DB...") + for idx, row in enumerate(df.itertuples(index=False), start=1): + # Convert row to dict + item = row._asdict() + item['id'] = str(item['ProductID']) + item['ProductID'] = str(item['ProductID']) + + # Insert or update item + container.upsert_item(body=item) + if idx % 10 == 0: + logger.info(f"Uploaded {idx}/{len(df)} products") + + logger.info(f"Successfully uploaded all {len(df)} products to Cosmos DB.") + +if __name__ == "__main__": + main() diff --git a/src/pipelines/upload_to_search.py b/src/pipelines/upload_to_search.py new file mode 100644 index 0000000..f689233 --- /dev/null +++ b/src/pipelines/upload_to_search.py @@ -0,0 +1,122 @@ +import logging +import os +from azure.cosmos import CosmosClient +from azure.search.documents import SearchClient +from azure.core.credentials import AzureKeyCredential +from azure.identity import DefaultAzureCredential +from dotenv import load_dotenv + +load_dotenv() + +# Configuration +COSMOS_ENDPOINT = os.environ.get("COSMOS_DB_ENDPOINT") +COSMOS_KEY = os.environ.get("COSMOS_DB_KEY") +DATABASE_NAME = os.environ.get("COSMOS_DB_NAME") +CONTAINER_NAME = os.environ.get("COSMOS_DB_CONTAINER_NAME") +SEARCH_ENDPOINT = os.environ.get("SEARCH_SERVICE_ENDPOINT") +SEARCH_KEY = os.environ.get("SEARCH_SERVICE_KEY") +INDEX_NAME = os.environ.get("SEARCH_INDEX_NAME", "products-index") + +# Configure logging +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +def get_cosmos_client(endpoint: str, key: str | None = None): + """Get Cosmos DB client with AAD or key-based auth.""" + if not endpoint: + raise ValueError("COSMOS_DB_ENDPOINT must be provided") + + # Try AAD first + try: + logger.info("Attempting to authenticate to Cosmos DB using DefaultAzureCredential (AAD)...") + credential = DefaultAzureCredential() + client = CosmosClient(endpoint, credential=credential) + # Validate + _ = list(client.list_databases()) + logger.info("Authenticated to 
Cosmos DB with DefaultAzureCredential.") + return client + except Exception as ex: + logger.warning(f"AAD authentication failed: {ex}") + + # Fallback to key + if key: + try: + logger.info("Falling back to key-based authentication for Cosmos DB...") + client = CosmosClient(endpoint, key) + # Validate + _ = list(client.list_databases()) + logger.info("Authenticated to Cosmos DB with key.") + return client + except Exception as ex: + logger.error(f"Key authentication failed: {ex}") + + raise RuntimeError("Failed to authenticate to Cosmos DB") + +def upload_documents_to_search(): + """Read documents from Cosmos DB and upload directly to Azure AI Search.""" + + # Connect to Cosmos DB + cosmos_client = get_cosmos_client(COSMOS_ENDPOINT, COSMOS_KEY) + database = cosmos_client.get_database_client(DATABASE_NAME) + container = database.get_container_client(CONTAINER_NAME) + + # Get all documents from Cosmos DB + logger.info(f"Reading documents from Cosmos DB container: {CONTAINER_NAME}...") + query = "SELECT * FROM c" + items = list(container.query_items(query=query, enable_cross_partition_query=True)) + logger.info(f"Retrieved {len(items)} documents from Cosmos DB") + + if len(items) == 0: + logger.warning("No documents found in Cosmos DB container") + return + + # Connect to Search + if SEARCH_KEY: + search_credential = AzureKeyCredential(SEARCH_KEY) + else: + search_credential = DefaultAzureCredential() + + search_client = SearchClient(endpoint=SEARCH_ENDPOINT, index_name=INDEX_NAME, credential=search_credential) + + # Prepare documents for upload + documents = [] + for item in items: + # Map Cosmos DB fields to Search index fields + doc = { + "id": str(item.get("id", item.get("ProductID"))), # Use Cosmos id or ProductID + "ProductID": str(item.get("ProductID")), + "ProductName": item.get("ProductName"), + "ProductCategory": item.get("ProductCategory"), + "ProductDescription": item.get("ProductDescription"), + "ProductPrice": float(item.get("Price", 
item.get("ProductPrice", 0.0))), + "ProductImageURL": item.get("ImageUrl", item.get("ProductImageURL", "")), + "content_for_vector": item.get("content_for_vector", "") + } + documents.append(doc) + + # Upload documents in batches + logger.info(f"Uploading {len(documents)} documents to Azure AI Search index: {INDEX_NAME}...") + try: + result = search_client.upload_documents(documents=documents) + success_count = sum(1 for r in result if r.succeeded) + failed_count = len(result) - success_count + + logger.info(f"Upload completed: {success_count} succeeded, {failed_count} failed") + + if failed_count > 0: + for r in result: + if not r.succeeded: + logger.error(f"Failed to upload document {r.key}: {r.error_message}") + + return success_count + except Exception as e: + logger.error(f"Failed to upload documents to search: {e}") + raise + +def main(): + logger.info("Starting data upload from Cosmos DB to Azure AI Search...") + count = upload_documents_to_search() + logger.info(f"Data upload completed successfully. 
{count} documents uploaded.") + +if __name__ == "__main__": + main() diff --git a/src/requirements.txt b/src/requirements.txt new file mode 100644 index 0000000..8f79d8d --- /dev/null +++ b/src/requirements.txt @@ -0,0 +1,8 @@ +requests==2.32.3 +python-dotenv==1.0.1 +pandas>=2.2.2 +azure-cosmos==4.9.0 +azure-identity==1.19.0 +azure-search-documents==11.6.0 +openai==1.54.5 +azure-ai-inference==1.0.0b6 diff --git a/src/verify_data.py b/src/verify_data.py new file mode 100644 index 0000000..e30444e --- /dev/null +++ b/src/verify_data.py @@ -0,0 +1,24 @@ +from azure.cosmos import CosmosClient +from azure.identity import DefaultAzureCredential +import os +from dotenv import load_dotenv +import json + +load_dotenv() + +credential = DefaultAzureCredential() +client = CosmosClient(os.environ['COSMOS_DB_ENDPOINT'], credential) +db = client.get_database_client(os.environ['COSMOS_DB_NAME']) +container = db.get_container_client(os.environ['COSMOS_DB_CONTAINER_NAME']) + +# Count total items +count = list(container.query_items('SELECT VALUE COUNT(1) FROM c', enable_cross_partition_query=True))[0] +print(f'✓ Total items in Cosmos DB container: {count}') + +# Get sample products +items = list(container.query_items('SELECT TOP 3 c.ProductID, c.ProductName, c.ProductCategory, c.Price FROM c ORDER BY c.ProductID', enable_cross_partition_query=True)) +print(f'\n✓ Sample products:') +for item in items: + print(f" - {item['ProductID']}: {item['ProductName']} ({item['ProductCategory']}) - ${item['Price']}") + +print('\n✓ Data successfully loaded into Cosmos DB!') diff --git a/src/verify_search.py b/src/verify_search.py new file mode 100644 index 0000000..e11e63e --- /dev/null +++ b/src/verify_search.py @@ -0,0 +1,27 @@ +from azure.search.documents import SearchClient +from azure.core.credentials import AzureKeyCredential +from dotenv import load_dotenv +import os + +load_dotenv() + +credential = AzureKeyCredential(os.environ['SEARCH_SERVICE_KEY']) +client = SearchClient( + 
endpoint=os.environ['SEARCH_SERVICE_ENDPOINT'], + index_name=os.environ['SEARCH_INDEX_NAME'], + credential=credential +) + +# Count documents +results = client.search(search_text='*', include_total_count=True) +total_count = results.get_count() +print(f'✓ Total documents in Azure AI Search index: {total_count}') + +# Show sample products +print(f'\n✓ Sample products:') +for i, doc in enumerate(results): + print(f" - {doc['ProductID']}: {doc['ProductName']} ({doc['ProductCategory']}) - ${doc['ProductPrice']}") + if i >= 2: + break + +print('\n✓ Data successfully loaded into Azure AI Search!') diff --git a/terraform-infrastructure/README.md b/terraform-infrastructure/README.md index cb38b65..39c766e 100644 --- a/terraform-infrastructure/README.md +++ b/terraform-infrastructure/README.md @@ -119,7 +119,7 @@ graph TD;
- Total views -

Refresh Date: 2025-11-22

+ Total views +

Refresh Date: 2025-11-24

diff --git a/terraform-infrastructure/main.tf b/terraform-infrastructure/main.tf index ece0b39..652b2f0 100644 --- a/terraform-infrastructure/main.tf +++ b/terraform-infrastructure/main.tf @@ -15,19 +15,20 @@ resource "random_id" "suffix" { locals { # Use provided user_principal_id or default to current Azure CLI user - principal_id = var.user_principal_id != null ? var.user_principal_id : data.azurerm_client_config.current.object_id - suffix = substr(random_id.suffix.hex, 0, 8) - cosmos_account_name = "${var.name_prefix}${local.suffix}cosmosdb" - cosmos_db_name = "zava" - storage_account = lower(replace("${var.name_prefix}${local.suffix}sa", "-", "")) - ai_foundry_name = "aif-${local.suffix}" # custom subdomain - ai_project_name = "proj-${local.suffix}" - search_service_name = "${var.name_prefix}-${local.suffix}-search" - app_service_plan = "${var.name_prefix}-${local.suffix}-asp" - log_analytics_name = "${var.name_prefix}-${local.suffix}-la" - app_insights_name = "${var.name_prefix}-${local.suffix}-ai" - registry_name = lower(replace("${var.name_prefix}${local.suffix}cosureg", "-", "")) - web_app_name = "${var.name_prefix}-${local.suffix}-app" + principal_id = var.user_principal_id != null ? 
var.user_principal_id : data.azurerm_client_config.current.object_id + suffix = substr(random_id.suffix.hex, 0, 8) + cosmos_account_name = "${var.name_prefix}${local.suffix}cosmosdb" + cosmos_db_name = "zava" + storage_account = lower(replace("${var.name_prefix}${local.suffix}sa", "-", "")) + ai_foundry_name = "aif-${local.suffix}" # custom subdomain + ai_project_name = "proj-${local.suffix}" + search_service_name = "${var.name_prefix}-${local.suffix}-search" + app_service_plan = "${var.name_prefix}-${local.suffix}-asp" + log_analytics_name = "${var.name_prefix}-${local.suffix}-la" + app_insights_name = "${var.name_prefix}-${local.suffix}-ai" + registry_name = lower(replace("${var.name_prefix}${local.suffix}cosureg", "-", "")) + web_app_name = "${var.name_prefix}-${local.suffix}-app" + cosmos_connection_auth_type = var.enable_cosmos_local_auth ? "AccountKey" : "AAD" } resource "azurerm_cosmosdb_account" "cosmos" { @@ -45,9 +46,9 @@ resource "azurerm_cosmosdb_account" "cosmos" { location = var.location failover_priority = 0 } - free_tier_enabled = false - analytical_storage_enabled = false - local_authentication_disabled = !var.enable_cosmos_local_auth + free_tier_enabled = false + analytical_storage_enabled = false + local_authentication_disabled = !var.enable_cosmos_local_auth } resource "azurerm_cosmosdb_sql_database" "cosmosdb" { @@ -57,28 +58,37 @@ resource "azurerm_cosmosdb_sql_database" "cosmosdb" { throughput = 400 } +resource "azurerm_cosmosdb_sql_container" "products" { + name = "product_catalog" + resource_group_name = azurerm_resource_group.rg.name + account_name = azurerm_cosmosdb_account.cosmos.name + database_name = azurerm_cosmosdb_sql_database.cosmosdb.name + partition_key_paths = ["/ProductID"] + throughput = 400 +} + # Storage account using AzAPI to bypass policy restrictions resource "azapi_resource" "storage" { type = "Microsoft.Storage/storageAccounts@2023-01-01" name = local.storage_account location = var.location parent_id = 
azurerm_resource_group.rg.id - + body = jsonencode({ sku = { name = "Standard_LRS" } kind = "StorageV2" properties = { - accessTier = "Hot" - allowSharedKeyAccess = true + accessTier = "Hot" + allowSharedKeyAccess = true defaultToOAuthAuthentication = false - allowBlobPublicAccess = false - minimumTlsVersion = "TLS1_2" - supportsHttpsTrafficOnly = true + allowBlobPublicAccess = false + minimumTlsVersion = "TLS1_2" + supportsHttpsTrafficOnly = true } }) - + identity { type = "SystemAssigned" } @@ -86,10 +96,10 @@ resource "azapi_resource" "storage" { # AI Foundry account (preview) using AzAPI provider. resource "azapi_resource" "ai_foundry" { - type = "Microsoft.CognitiveServices/accounts@2025-06-01" - name = local.ai_foundry_name - location = var.location - parent_id = azurerm_resource_group.rg.id + type = "Microsoft.CognitiveServices/accounts@2025-06-01" + name = local.ai_foundry_name + location = var.location + parent_id = azurerm_resource_group.rg.id schema_validation_enabled = false identity { type = "SystemAssigned" } body = jsonencode({ @@ -104,13 +114,13 @@ resource "azapi_resource" "ai_foundry" { } resource "azapi_resource" "ai_project" { - type = "Microsoft.CognitiveServices/accounts/projects@2025-06-01" - name = local.ai_project_name - location = var.location - parent_id = azapi_resource.ai_foundry.id + type = "Microsoft.CognitiveServices/accounts/projects@2025-06-01" + name = local.ai_project_name + location = var.location + parent_id = azapi_resource.ai_foundry.id schema_validation_enabled = false identity { type = "SystemAssigned" } - body = jsonencode({ properties = {} }) + body = jsonencode({ properties = {} }) depends_on = [azapi_resource.ai_foundry] } @@ -157,7 +167,7 @@ resource "azurerm_container_registry_webhook" "webhook" { status = "enabled" scope = "${local.suffix}/techworkshopl300/zava:latest" actions = ["push"] - + custom_headers = { "Content-Type" = "application/json" } @@ -185,7 +195,7 @@ resource "azurerm_linux_web_app" "app" { 
docker_image_name = "${local.registry_name}.azurecr.io/${local.suffix}/techworkshopl300/zava:latest" docker_registry_url = "https://${local.registry_name}.azurecr.io" } - http2_enabled = true + http2_enabled = true minimum_tls_version = "1.2" } @@ -297,7 +307,7 @@ resource "azurerm_role_assignment" "storage_blob_data_contributor_project" { # Azure AI model deployments automation resource "null_resource" "ai_model_deployments" { count = var.enable_ai_automation ? 1 : 0 - + depends_on = [ azapi_resource.ai_project, azapi_resource.ai_foundry, @@ -305,7 +315,7 @@ resource "null_resource" "ai_model_deployments" { ] provisioner "local-exec" { - command = <<-EOT + command = <<-EOT # Create AI model deployments Write-Host "Creating Azure AI model deployments..." @@ -395,58 +405,221 @@ resource "null_resource" "ai_model_deployments" { } } -# Connect resources to Azure AI Foundry project -resource "null_resource" "ai_project_connections" { +# Connection helper actions for Foundry resources +data "azapi_resource_action" "storage_list_keys" { + count = var.enable_ai_automation ? 1 : 0 + type = "Microsoft.Storage/storageAccounts@2023-01-01" + resource_id = azapi_resource.storage.id + action = "listKeys" + response_export_values = ["keys"] + body = jsonencode({}) + depends_on = [azapi_resource.storage] +} + +data "azapi_resource_action" "search_admin_keys" { + count = var.enable_ai_automation ? 1 : 0 + type = "Microsoft.Search/searchServices@2025-02-01-preview" + resource_id = azurerm_search_service.search.id + action = "listAdminKeys" + response_export_values = ["primaryKey"] + body = jsonencode({}) + depends_on = [azurerm_search_service.search] +} + +data "azapi_resource_action" "cosmos_keys" { + count = (var.enable_ai_automation && var.enable_cosmos_local_auth) ? 
1 : 0 + type = "Microsoft.DocumentDB/databaseAccounts@2024-11-15" + resource_id = azurerm_cosmosdb_account.cosmos.id + action = "listKeys" + response_export_values = ["primaryMasterKey"] + body = jsonencode({}) + depends_on = [azurerm_cosmosdb_account.cosmos] +} + +# Connect resources to Azure AI Foundry project using ARM templates +resource "azapi_resource" "storage_connection" { count = var.enable_ai_automation ? 1 : 0 - + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-storage" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + + depends_on = [ + azapi_resource.storage, + azapi_resource.ai_foundry + ] + + body = jsonencode({ + properties = { + category = "AzureStorageAccount" + target = "https://${local.storage_account}.blob.core.windows.net" + authType = "AccountKey" + isSharedToAll = true + credentials = { + key = jsondecode(data.azapi_resource_action.storage_list_keys[0].output).keys[0].value + } + metadata = { + ApiType = "Azure" + ResourceId = azapi_resource.storage.id + } + } + }) +} + +resource "azapi_resource" "app_insights_connection" { + count = var.enable_ai_automation ? 1 : 0 + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-appinsights" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + depends_on = [ - null_resource.ai_model_deployments, azurerm_application_insights.appinsights, - azapi_resource.storage + azapi_resource.ai_foundry ] - provisioner "local-exec" { - command = <<-EOT - Write-Host "Verifying Azure AI Foundry project configuration..." - - # Check if Azure ML extension is installed - $mlExtension = az extension list --query "[?name=='ml'].name" --output tsv - if (-not $mlExtension) { - Write-Host "Installing Azure ML extension..." 
- az extension add --name ml + body = jsonencode({ + properties = { + category = "AppInsights" + target = azurerm_application_insights.appinsights.id + authType = "ApiKey" + isSharedToAll = true + credentials = { + key = azurerm_application_insights.appinsights.connection_string } - - # Set the AI project as the default workspace for future ML operations - az config set defaults.workspace="${local.ai_project_name}" - az config set defaults.group="${azurerm_resource_group.rg.name}" - - Write-Host "Azure AI project configuration completed successfully." - Write-Host "Project Name: ${local.ai_project_name}" + metadata = { + ApiType = "Azure" + ResourceId = azurerm_application_insights.appinsights.id + } + } + }) +} + +resource "azapi_resource" "search_connection" { + count = var.enable_ai_automation ? 1 : 0 + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-aisearch" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + + depends_on = [ + azurerm_search_service.search, + azapi_resource.ai_foundry + ] + + body = jsonencode({ + properties = { + category = "CognitiveSearch" + target = "https://${local.search_service_name}.search.windows.net" + authType = "ApiKey" + isSharedToAll = true + credentials = { + key = jsondecode(data.azapi_resource_action.search_admin_keys[0].output).primaryKey + } + metadata = { + ApiType = "Azure" + ResourceId = azurerm_search_service.search.id + location = azurerm_search_service.search.location + } + } + }) +} + +resource "azapi_resource" "cosmos_connection" { + count = var.enable_ai_automation ? 
1 : 0 + + type = "Microsoft.CognitiveServices/accounts/connections@2025-04-01-preview" + name = "${local.ai_foundry_name}-cosmosdb" + parent_id = azapi_resource.ai_foundry.id + schema_validation_enabled = false + + depends_on = [ + azurerm_cosmosdb_account.cosmos, + azapi_resource.ai_foundry + ] + + body = jsonencode({ + properties = merge({ + category = "CosmosDb" + target = azurerm_cosmosdb_account.cosmos.endpoint + authType = local.cosmos_connection_auth_type + isSharedToAll = true + metadata = { + ApiType = "Azure" + ResourceId = azurerm_cosmosdb_account.cosmos.id + location = azurerm_cosmosdb_account.cosmos.location + } + }, var.enable_cosmos_local_auth ? { + credentials = { + key = jsondecode(data.azapi_resource_action.cosmos_keys[0].output).primaryMasterKey + } + } : {}) + }) +} + +# Verification script for connections +resource "null_resource" "verify_connections" { + count = var.enable_ai_automation ? 1 : 0 + + depends_on = [ + azapi_resource.storage_connection, + azapi_resource.app_insights_connection, + azapi_resource.search_connection, + azapi_resource.cosmos_connection + ] + + provisioner "local-exec" { + command = <<-EOT + Write-Host "=== Verifying Microsoft Foundry Project Connections ===" + Write-Host "" + Write-Host "Project: ${local.ai_project_name}" Write-Host "AI Foundry: ${local.ai_foundry_name}" Write-Host "Resource Group: ${azurerm_resource_group.rg.name}" + Write-Host "" + + # List connections using Azure CLI + Write-Host "Checking connections via Azure CLI..." + az rest --method GET --url "https://management.azure.com/subscriptions/${data.azurerm_client_config.current.subscription_id}/resourceGroups/${azurerm_resource_group.rg.name}/providers/Microsoft.CognitiveServices/accounts/${local.ai_foundry_name}/connections?api-version=2025-06-01" --query "value[].{Name:name,Type:properties.connectionType,Target:properties.target}" --output table + + Write-Host "" + Write-Host "✓ Microsoft Foundry project connections verification completed!" 
+ Write-Host "" + Write-Host "Available connections:" + Write-Host " - Storage Account: ${local.storage_account}" + Write-Host " - Application Insights: ${local.app_insights_name}" + Write-Host " - Azure AI Search: ${local.search_service_name}" + Write-Host " - Cosmos DB: ${local.cosmos_account_name}" + Write-Host "" + Write-Host "View in Azure Portal:" + Write-Host " https://ai.azure.com/resource/overview/${local.ai_foundry_name}" + Write-Host " Navigate to Management center > Connected resources" EOT interpreter = ["PowerShell", "-Command"] } triggers = { - storage_id = azapi_resource.storage.id - app_insights_id = azurerm_application_insights.appinsights.id - ai_project_id = azapi_resource.ai_project.id + storage_conn = var.enable_ai_automation ? azapi_resource.storage_connection[0].id : "" + app_insights_conn = var.enable_ai_automation ? azapi_resource.app_insights_connection[0].id : "" + search_conn = var.enable_ai_automation ? azapi_resource.search_connection[0].id : "" + cosmos_conn = var.enable_ai_automation ? azapi_resource.cosmos_connection[0].id : "" } } # Create .env file with all necessary configuration resource "null_resource" "create_env_file" { count = var.enable_ai_automation ? 1 : 0 - + depends_on = [ - null_resource.ai_project_connections, + null_resource.verify_connections, azurerm_cosmosdb_account.cosmos, azurerm_search_service.search ] provisioner "local-exec" { - command = <<-EOT + command = <<-EOT Write-Host "Creating .env file with Azure resource configuration..." 
# Create src directory if it doesn't exist @@ -509,7 +682,9 @@ AZURE_OPENAI_API_VERSION=2024-02-01 COSMOS_DB_ENDPOINT=${azurerm_cosmosdb_account.cosmos.endpoint} COSMOS_DB_KEY=$cosmosKey COSMOS_DB_NAME=${local.cosmos_db_name} -COSMOS_DB_CONTAINER_NAME=products +COSMOS_DB_CONTAINER_NAME=product_catalog +COSMOS_SKIP_IF_EXISTS=true +COSMOS_FORCE_INGEST=false # Azure AI Search Configuration SEARCH_SERVICE_ENDPOINT=https://${local.search_service_name}.search.windows.net @@ -546,7 +721,9 @@ AZURE_OPENAI_API_VERSION=2024-02-01 COSMOS_DB_ENDPOINT=${azurerm_cosmosdb_account.cosmos.endpoint} COSMOS_DB_KEY=$cosmosKey COSMOS_DB_NAME=${local.cosmos_db_name} -COSMOS_DB_CONTAINER_NAME=products +COSMOS_DB_CONTAINER_NAME=product_catalog +COSMOS_SKIP_IF_EXISTS=true +COSMOS_FORCE_INGEST=false # Azure AI Search Configuration SEARCH_SERVICE_ENDPOINT=https://${local.search_service_name}.search.windows.net @@ -589,11 +766,118 @@ AZURE_LOCATION=${var.location} triggers = { # Trigger recreation when any of these resources change - ai_foundry_id = azapi_resource.ai_foundry.id - ai_project_id = azapi_resource.ai_project.id - cosmos_id = azurerm_cosmosdb_account.cosmos.id - search_id = azurerm_search_service.search.id - storage_id = azapi_resource.storage.id + ai_foundry_id = azapi_resource.ai_foundry.id + ai_project_id = azapi_resource.ai_project.id + cosmos_id = azurerm_cosmosdb_account.cosmos.id + search_id = azurerm_search_service.search.id + storage_id = azapi_resource.storage.id app_insights_id = azurerm_application_insights.appinsights.id } } + +# Data pipeline automation - runs after .env file is created +resource "null_resource" "data_pipeline" { + count = var.enable_data_pipeline ? 1 : 0 + + depends_on = [ + null_resource.create_env_file, + azurerm_cosmosdb_sql_database.cosmosdb, + azurerm_cosmosdb_sql_container.products + ] + + provisioner "local-exec" { + command = <<-EOT + Write-Host "Starting data pipeline automation..." 
+ + # Navigate to src directory + cd ../src + + # Check if Python is available + try { + $pythonCmd = (Get-Command python -ErrorAction Stop).Source + Write-Host "Found Python at: $pythonCmd" + } catch { + Write-Host "ERROR: Python is not installed or not in PATH" + Write-Host "Please install Python 3.8+ from https://www.python.org/downloads/" + exit 1 + } + + # Create virtual environment + Write-Host "Creating Python virtual environment..." + if (Test-Path "venv") { + Write-Host "Virtual environment already exists, removing..." + Remove-Item -Recurse -Force venv + } + python -m venv venv + + # Install dependencies directly to venv without activation + Write-Host "Installing Python dependencies (with retry)..." + $pythonExe = "venv\Scripts\python.exe" + $pipExe = "venv\Scripts\pip.exe" + + if (Test-Path $pythonExe) { + & $pythonExe -m pip install --upgrade pip + $maxAttempts = 3 + for ($i = 1; $i -le $maxAttempts; $i++) { + Write-Host "pip install attempt $i..." + & $pipExe install -r requirements.txt + if ($LASTEXITCODE -eq 0) { + Write-Host "Dependencies installed successfully on attempt $i" + break + } else { + Write-Host "pip install failed (exit $LASTEXITCODE)." + if ($i -lt $maxAttempts) { + Write-Host "Retrying after short backoff..." + Start-Sleep -Seconds 5 + } else { + Write-Host "ERROR: Dependencies failed after $maxAttempts attempts" + exit 1 + } + } + } + + Write-Host "Python environment ready" + Write-Host "" + + # Check if CSV data file exists + $csvFile = "data/updated_product_catalog(in).csv" + if (!(Test-Path $csvFile)) { + Write-Host "WARNING: CSV data file not found at $csvFile" + Write-Host "Please download the product catalog data or place it in the data directory" + Write-Host "Skipping data import for now" + } else { + Write-Host "Step 1: Importing data to Cosmos DB (skip logic flags: COSMOS_SKIP_IF_EXISTS / COSMOS_FORCE_INGEST)..." 
+ & $pythonExe pipelines/ingest_to_cosmos.py + + Write-Host "" + Write-Host "Step 2: Creating Azure AI Search index..." + & $pythonExe pipelines/create_search_index.py + + Write-Host "" + Write-Host "Step 3: Uploading data from Cosmos DB to Azure AI Search..." + & $pythonExe pipelines/upload_to_search.py + + Write-Host "" + Write-Host "Data pipeline completed successfully!" + Write-Host "- Cosmos DB container created and populated" + Write-Host "- Azure AI Search index created" + Write-Host "- Data imported to search index" + } + } else { + Write-Host "ERROR: Failed to create virtual environment" + exit 1 + } + + Write-Host "" + Write-Host "Data pipeline automation completed" + EOT + interpreter = ["PowerShell", "-Command"] + working_dir = path.module + } + + triggers = { + cosmos_db_id = azurerm_cosmosdb_sql_database.cosmosdb.id + search_id = azurerm_search_service.search.id + env_file_id = null_resource.create_env_file[0].id + } +} diff --git a/terraform-infrastructure/terraform.tfvars b/terraform-infrastructure/terraform.tfvars index 0953c4e..2cc89f4 100644 --- a/terraform-infrastructure/terraform.tfvars +++ b/terraform-infrastructure/terraform.tfvars @@ -1,4 +1,4 @@ -resource_group_name = "RG-AI-retailw3" +resource_group_name = "RG-AI-retailbrw5" location = "westus3" name_prefix = "zava" # user_principal_id is optional - defaults to current Azure CLI user (az login) diff --git a/terraform-infrastructure/variables.tf b/terraform-infrastructure/variables.tf index bda5d3d..f69a72e 100644 --- a/terraform-infrastructure/variables.tf +++ b/terraform-infrastructure/variables.tf @@ -32,3 +32,10 @@ variable "enable_ai_automation" { description = "Whether to run Azure AI Foundry automation steps (model deployments, connections, .env creation)" default = true } + +variable "enable_data_pipeline" { + type = bool + description = "Whether to run data pipeline automation (requires Python and data files)" + default = true +} +