def absolute_url(value):
    """
    Argparse type callback that accepts only absolute URLs.

    A URL counts as absolute when parsing it yields both a scheme and a network location.

    Args:
        value (str): Candidate URL taken from the command line.
    Raises:
        argparse.ArgumentTypeError: If the scheme or the network location is missing.
    Returns:
        str: The input value, unchanged.
    """
    components = urlparse(value)
    # Both parts are required: a scheme alone ("https://") or a bare host is rejected
    if components.scheme and components.netloc:
        return value
    raise argparse.ArgumentTypeError(f"'{value}' is not a valid absolute URL")


def valid_name(value):
    """
    Argparse type callback for names built from alphanumeric characters, "-" and "_".

    Neither the length nor the letter case of the name is checked.

    Args:
        value (str): Candidate name taken from the command line.
    Raises:
        argparse.ArgumentTypeError: If the name is empty, whitespace-only, or contains
            any character other than alphanumerics, "-" and "_".
    Returns:
        str: The input value, unchanged.
    """
    if not (value and value.strip()):
        raise argparse.ArgumentTypeError(f"'{value}' is not a valid name")

    # Drop the two permitted separators; whatever remains must be purely alphanumeric
    without_separators = "".join(char for char in value if char not in "-_")
    if without_separators.isalnum():
        return value
    raise argparse.ArgumentTypeError(
        f"'{value}' contains invalid characters. Look at the documentation for naming conventions."
    )
To successfully execute any of the commands outlined in this README, run them from within the network._ + +## How to upload data into the storage using the Python script + +To execute the upload_data.py script, ensure you have Python 3.8+ installed and install the required dependencies (azure-identity and azure-storage-blob) by running `pip install azure-identity azure-storage-blob`. Authenticate to Azure using `az login` or environment variables for service principal credentials. + +Run the script from the terminal with the following command: + +`python -m upload_data --storage_name <storage_account_name> --container_name <container_name>` + +Replace `<storage_account_name>` and `<container_name>` with your Azure Storage account and container names. The script uploads all .pdf files from its directory to the specified container, creating the container if it doesn't exist. Ensure the storage account name contains only lowercase letters and digits. Logs will confirm the upload process. + +## How to upload data using the Linux Shell Script + +Authenticate to Azure using `az login` or environment variables for service principal credentials. Execute the script: + +```bash +./upload_data.sh <storage_account_name> <container_name> +``` + +Replace `<storage_account_name>` and `<container_name>` with your Azure Storage account and container names. + +## How to upload data using the PowerShell Script + +Authenticate to Azure using `az login` or environment variables for service principal credentials. Execute the PowerShell script: + +```bash +./upload_data.ps1 -StorageAccountName <storage_account_name> -ContainerName <container_name> +``` + +Replace `<storage_account_name>` and `<container_name>` with your Azure Storage account and container names. 
<#
.SYNOPSIS
    Uploads all PDF files from the current directory to an Azure Blob Storage container.

.DESCRIPTION
    This script uses the Azure CLI to authenticate and upload files to Azure Blob Storage.
    It checks if the specified container exists and creates it if necessary.

    PDF files are discovered recursively below the current directory. The blob name is the
    file path relative to the current directory with every directory separator replaced by
    an underscore, which matches the naming used by upload_data.py and upload_data.sh.

    The script exits with code 1 when the container cannot be checked or created, or when
    at least one file fails to upload.

.PARAMETER StorageAccountName
    The name of the Azure Storage account.

.PARAMETER ContainerName
    The name of the Azure Blob Storage container.

.EXAMPLE
    ./upload_data.ps1 -StorageAccountName "mystorageaccount" -ContainerName "mycontainer"
#>

param (
    [Parameter(Mandatory = $true)]
    [string]$StorageAccountName,

    [Parameter(Mandatory = $true)]
    [string]$ContainerName
)

# Resolve the current directory to a plain string. Get-Location returns a PathInfo object,
# so its .Length is NOT the length of the path text and cannot be used to cut the prefix.
$LocalFolder = (Get-Location).Path

# Check if the container exists, and create it if it doesn't
Write-Host "Checking if container '$ContainerName' exists in storage account '$StorageAccountName'..."
$ContainerExists = az storage container exists `
    --account-name $StorageAccountName `
    --name $ContainerName `
    --auth-mode login `
    --query "exists" `
    --output tsv
if ($LASTEXITCODE -ne 0) {
    # Typically an authentication or network problem; creating the container would fail too
    Write-Host "Failed to check container '$ContainerName'." -ForegroundColor Red
    exit 1
}

if ($ContainerExists -ne "true") {
    Write-Host "Container '$ContainerName' does not exist. Creating it..."
    az storage container create `
        --account-name $StorageAccountName `
        --name $ContainerName `
        --auth-mode login `
        --output none
    if ($LASTEXITCODE -ne 0) {
        Write-Host "Failed to create container '$ContainerName'." -ForegroundColor Red
        exit 1
    }
    Write-Host "Container '$ContainerName' created successfully." -ForegroundColor Green
} else {
    Write-Host "Container '$ContainerName' already exists." -ForegroundColor Green
}

# Upload all PDF files from the current directory
Write-Host "Uploading PDF files from '$LocalFolder' to container '$ContainerName'..."
$FailedCount = 0
$PdfFiles = Get-ChildItem -Path $LocalFolder -Recurse -Filter *.pdf -File
foreach ($File in $PdfFiles) {
    $FilePath = $File.FullName

    # Strip the folder prefix (and the separator that follows it), then flatten the
    # remaining relative path into a single blob name.
    $RelativePath = $FilePath.Substring($LocalFolder.Length).TrimStart('\', '/')
    $BlobName = $RelativePath -replace '[\\/]', '_'

    Write-Host "Uploading '$FilePath' as blob '$BlobName'..."
    az storage blob upload `
        --account-name $StorageAccountName `
        --container-name $ContainerName `
        --name $BlobName `
        --auth-mode login `
        --file $FilePath `
        --overwrite
    if ($LASTEXITCODE -ne 0) {
        Write-Host "Failed to upload '$FilePath'." -ForegroundColor Red
        $FailedCount++
    } else {
        Write-Host "Uploaded '$FilePath' successfully." -ForegroundColor Green
    }
}

if ($FailedCount -gt 0) {
    # Keep going through all files, but make the failure visible to callers and CI
    Write-Host "Upload process completed with $FailedCount failed file(s)." -ForegroundColor Red
    exit 1
}

Write-Host "Upload process completed." -ForegroundColor Cyan
logger = logging.getLogger(__name__)

# Setting the threshold of logger to DEBUG
logger.setLevel(logging.DEBUG)

# Create a console handler and set its level to DEBUG
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# Create a formatter and set it for the console handler
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)

# Add the console handler to the logger
logger.addHandler(console_handler)

STORAGE_ACCOUNT_URL = "https://{storage_account_name}.blob.core.windows.net"


def upload_data_files(
    credential: "DefaultAzureCredential",
    storage_account_name: str,
    storage_container: str,
    local_folder: str,
):
    """
    Upload every PDF file found below a local folder into a blob container.

    The container is created when it does not exist yet. Files are discovered recursively;
    the blob name is the path relative to ``local_folder`` with each path separator replaced
    by "_". Existing blobs with the same name are overwritten.

    Args:
        credential: The Azure credential used to authenticate against the storage account.
        storage_account_name: The name of the Azure storage account.
        storage_container: The name of the target blob container.
        local_folder: The folder that is searched recursively for ``*.pdf`` files.

    Raises:
        Exception: Any error raised while uploading a file is logged and re-raised,
            which stops the upload at the first failing file.

    Returns:
        None
    """
    account_url = STORAGE_ACCOUNT_URL.format(storage_account_name=storage_account_name)
    blob_service_client = BlobServiceClient(account_url=account_url, credential=credential)
    blob_container_client = blob_service_client.get_container_client(storage_container)

    if not blob_container_client.exists():
        logger.info(f"Creating {storage_container} container.")
        blob_container_client.create_container()
        logger.info("Done.")

    for file in Path(local_folder).rglob("*.pdf"):
        logger.info(f"Uploading {file} to {storage_container}.")

        # construct blob name from file path
        # everything rather than local_folder
        file_subpath = os.path.relpath(file, start=local_folder)

        # generate a unique name of the file
        file_name = file_subpath.replace(os.sep, "_")

        try:
            logger.info(f"Ready to copy: {str(file)} to {file_name}.")
            with open(file=str(file), mode="rb") as data:
                blob_container_client.upload_blob(name=file_name, data=data, overwrite=True)
            logger.info("Done.")
        except Exception as e:
            # Log at ERROR level with the traceback; an INFO record hides the failure
            logger.exception("Exception uploading file name %s: %s", file_name, e)
            raise


def main():
    """
    Upload data files to Azure Blob Storage.

    This function reads the parameters from the command line, authenticates to Azure using
    default credentials, and uploads the PDF files located next to this script to the
    specified Azure Blob Storage container.

    Command-line arguments:
        --storage_name: Azure storage account name (3-24 lowercase letters and digits).
        --container_name: Azure storage container name.

    Raises:
        ValueError: If the storage account name does not follow the naming rules.
    """
    # Local import keeps this fix self-contained without touching the module import block
    import re

    logger.info("Read and check parameters.")
    # Extract the configuration parameters from the command line
    parser = argparse.ArgumentParser(description="Parameter parser")
    parser.add_argument(
        "--storage_name",
        required=True,
        help="Azure storage account name",
    )
    parser.add_argument(
        "--container_name",
        required=True,
        help="Azure storage container name",
    )
    args = parser.parse_args()

    # Validate storage account name. An explicit ASCII pattern is used because
    # str.islower() rejects digit-only names and str.isalnum() accepts non-ASCII characters.
    if not re.fullmatch(r"[a-z0-9]{3,24}", args.storage_name):
        raise ValueError(
            "Storage account name must be 3-24 characters long and contain only lowercase letters and digits."
        )

    # Using default Azure credentials assuming that it has all needed permissions
    logger.info("Authenticate code into Azure using default credentials.")
    credential = DefaultAzureCredential()

    # Upload every PDF stored next to this script; abspath guards against a relative __file__
    logger.info("Uploading process has been started.")
    upload_data_files(
        credential=credential,
        storage_account_name=args.storage_name,
        storage_container=args.container_name,
        local_folder=os.path.dirname(os.path.abspath(__file__)),
    )
    logger.info("Uploading process has been completed.")


# This block ensures that the script runs the main function only when executed directly,
# and not when imported as a module in another script.
if __name__ == "__main__":
    main()
# It uses the Azure CLI for authentication and file uploads.
# PDF files are discovered recursively; the blob name is the path relative to the
# current directory with every "/" replaced by "_".
# The script exits with status 1 if the container cannot be created or if any upload fails.

# Usage:
#   ./upload_data.sh <storage_account_name> <container_name>

# Check if the required arguments are provided
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <storage_account_name> <container_name>"
    exit 1
fi

STORAGE_ACCOUNT_NAME=$1
CONTAINER_NAME=$2
LOCAL_FOLDER=$(pwd)

# Check if the container exists, and create it if it doesn't
echo "Checking if container '$CONTAINER_NAME' exists in storage account '$STORAGE_ACCOUNT_NAME'..."
CONTAINER_EXISTS=$(az storage container exists --account-name "$STORAGE_ACCOUNT_NAME" --name "$CONTAINER_NAME" --auth-mode login --query "exists" --output tsv)

if [ "$CONTAINER_EXISTS" != "true" ]; then
    echo "Container '$CONTAINER_NAME' does not exist. Creating it..."
    if ! az storage container create --account-name "$STORAGE_ACCOUNT_NAME" --name "$CONTAINER_NAME" --auth-mode login --output none; then
        echo "Failed to create container '$CONTAINER_NAME'."
        exit 1
    fi
    echo "Container '$CONTAINER_NAME' created successfully."
else
    echo "Container '$CONTAINER_NAME' already exists."
fi

# Upload all PDF files from the current directory
echo "Uploading PDF files from '$LOCAL_FOLDER' to container '$CONTAINER_NAME'..."
FAILED_COUNT=0

# Read NUL-delimited paths so file names containing spaces or newlines stay intact.
# Process substitution (instead of a pipe) keeps the loop in the current shell,
# so FAILED_COUNT is still set after the loop.
while IFS= read -r -d '' file; do
    # Strip the folder prefix literally (no regex involved), then flatten the
    # relative path into a unique blob name by replacing "/" with "_".
    relative_path="${file#"$LOCAL_FOLDER"/}"
    BLOB_NAME="${relative_path//\//_}"

    echo "Uploading '$file' as blob '$BLOB_NAME'..."
    if az storage blob upload --account-name "$STORAGE_ACCOUNT_NAME" --container-name "$CONTAINER_NAME" --name "$BLOB_NAME" --auth-mode login --file "$file" --overwrite; then
        echo "Uploaded '$file' successfully."
    else
        echo "Failed to upload '$file'."
        FAILED_COUNT=$((FAILED_COUNT + 1))
    fi
done < <(find "$LOCAL_FOLDER" -type f -name "*.pdf" -print0)

if [ "$FAILED_COUNT" -gt 0 ]; then
    # Every file is still attempted, but the failure is reported to the caller
    echo "Upload process completed with $FAILED_COUNT failed file(s)."
    exit 1
fi

echo "Upload process completed."
diff --git a/python/chatwithdata/index_config/documentDataSource.json b/python/chatwithdata/index_config/documentDataSource.json new file mode 100644 index 0000000..554f144 --- /dev/null +++ b/python/chatwithdata/index_config/documentDataSource.json @@ -0,0 +1,16 @@ +{ + "name": "", + "description": null, + "type": "azureblob", + "subtype": null, + "credentials": { + "connectionString": "" + }, + "container": { + "name": "", + "query": null + }, + "dataChangeDetectionPolicy": null, + "dataDeletionDetectionPolicy": null, + "encryptionKey": null +} diff --git a/python/chatwithdata/index_config/documentIndex.json b/python/chatwithdata/index_config/documentIndex.json new file mode 100644 index 0000000..a992ab9 --- /dev/null +++ b/python/chatwithdata/index_config/documentIndex.json @@ -0,0 +1,126 @@ +{ + "name": "", + "fields": [ + { + "name": "chunk_id", + "type": "Edm.String", + "searchable": true, + "filterable": false, + "retrievable": true, + "stored": true, + "sortable": true, + "facetable": false, + "key": true, + "analyzer": "keyword", + "synonymMaps": [] + }, + { + "name": "parent_id", + "type": "Edm.String", + "searchable": false, + "filterable": true, + "retrievable": true, + "stored": true, + "sortable": false, + "facetable": false, + "key": false, + "synonymMaps": [] + }, + { + "name": "chunk", + "type": "Edm.String", + "searchable": true, + "filterable": false, + "retrievable": true, + "stored": true, + "sortable": false, + "facetable": false, + "key": false, + "synonymMaps": [] + }, + { + "name": "title", + "type": "Edm.String", + "searchable": true, + "filterable": false, + "retrievable": true, + "stored": true, + "sortable": false, + "facetable": false, + "key": false, + "synonymMaps": [] + }, + { + "name": "text_vector", + "type": "Collection(Edm.Single)", + "searchable": true, + "filterable": false, + "retrievable": true, + "stored": true, + "sortable": false, + "facetable": false, + "key": false, + "dimensions": 3072, + "vectorSearchProfile": 
"vector--azureOpenAi-text-profile", + "synonymMaps": [] + } + ], + "scoringProfiles": [], + "suggesters": [], + "analyzers": [], + "tokenizers": [], + "tokenFilters": [], + "charFilters": [], + "similarity": { + "@odata.type": "#Microsoft.Azure.Search.BM25Similarity" + }, + "semantic": { + "defaultConfiguration": "vector--semantic-configuration", + "configurations": [ + { + "name": "vector--semantic-configuration", + "prioritizedFields": { + "prioritizedContentFields": [ + { + "fieldName": "chunk" + } + ], + "prioritizedKeywordsFields": [] + } + } + ] + }, + "vectorSearch": { + "algorithms": [ + { + "name": "vector--algorithm", + "kind": "hnsw", + "hnswParameters": { + "metric": "cosine", + "m": 4, + "efConstruction": 400, + "efSearch": 500 + } + } + ], + "profiles": [ + { + "name": "vector--azureOpenAi-text-profile", + "algorithm": "vector--algorithm", + "vectorizer": "vector--azureOpenAi-text-vectorizer" + } + ], + "vectorizers": [ + { + "name": "vector--azureOpenAi-text-vectorizer", + "kind": "azureOpenAI", + "azureOpenAIParameters": { + "resourceUri": "", + "deploymentId": "text-embedding-3-large", + "modelName": "text-embedding-3-large" + } + } + ], + "compressions": [] + } +} diff --git a/python/chatwithdata/index_config/documentIndexer.json b/python/chatwithdata/index_config/documentIndexer.json new file mode 100644 index 0000000..f7e9bf8 --- /dev/null +++ b/python/chatwithdata/index_config/documentIndexer.json @@ -0,0 +1,28 @@ +{ + "name": "", + "description": null, + "dataSourceName": "", + "skillsetName": "", + "targetIndexName": "", + "disabled": null, + "schedule": null, + "parameters": { + "batchSize": null, + "maxFailedItems": null, + "maxFailedItemsPerBatch": null, + "base64EncodeKeys": null, + "configuration": { + "dataToExtract": "contentAndMetadata", + "parsingMode": "default" + } + }, + "fieldMappings": [ + { + "sourceFieldName": "metadata_storage_name", + "targetFieldName": "title", + "mappingFunction": null + } + ], + "outputFieldMappings": [], 
+ "encryptionKey": null +} diff --git a/python/chatwithdata/index_config/documentSkillSet.json b/python/chatwithdata/index_config/documentSkillSet.json new file mode 100644 index 0000000..56b961c --- /dev/null +++ b/python/chatwithdata/index_config/documentSkillSet.json @@ -0,0 +1,83 @@ +{ + "name": "", + "description": "Skillset to chunk documents and generate embeddings", + "skills": [ + { + "@odata.type": "#Microsoft.Skills.Text.SplitSkill", + "name": "chunker", + "textSplitMode": "pages", + "description": null, + "context": "/document", + "inputs": [ + { + "name": "text", + "source": "/document/content" + } + ], + "outputs": [ + { + "name": "textItems", + "targetName": "chunks" + } + ], + "maxChunkSize": 512, + "maxChunkCount": 1000 + }, + { + "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill", + "name": "openAIEmbedding", + "description": "Generate embeddings using OpenAI", + "resourceUri": "", + "modelName": "text-embedding-3-large", + "deploymentId": "text-embedding-3-large", + "context": "/document/chunks/*", + "inputs": [ + { + "name": "text", + "source": "/document/chunks/*" + } + ], + "outputs": [ + { + "name": "embedding", + "targetName": "embedding" + } + ] + } + ], + "cognitiveServices": null, + "knowledgeStore": null, + "indexProjections": { + "selectors": [ + { + "targetIndexName": "", + "parentKeyFieldName": "parent_id", + "sourceContext": "/document/chunks/*", + "mappings": [ + { + "name": "chunk", + "source": "/document/chunks/*", + "sourceContext": null, + "inputs": [] + }, + { + "name": "text_vector", + "source": "/document/chunks/*/embedding", + "sourceContext": null, + "inputs": [] + }, + { + "name": "title", + "source": "/document/metadata_storage_name", + "sourceContext": null, + "inputs": [] + } + ] + } + ], + "parameters": { + "projectionMode": "skipIndexingParentDocuments" + } + }, + "encryptionKey": null + } diff --git a/python/chatwithdata/index_utils.py b/python/chatwithdata/index_utils.py new file mode 100644 index 
def _prepare_json_schema(file_name: str, values_to_assign: dict) -> str:
    """
    Load a JSON template and substitute placeholder tokens with actual values.

    Every occurrence of each key of ``values_to_assign`` in the file text is replaced by the
    corresponding value. Values are escaped as JSON string content before insertion, so a
    value containing a quote or a backslash cannot break the resulting document.

    Args:
        file_name: The path to the json file.
        values_to_assign: A dictionary mapping placeholder tokens to the string values
            that replace them in the json file.

    Raises:
        ValueError: If a placeholder token is empty (replacing the empty string would
            insert the value between every character of the document).
        TypeError: If a replacement value is not a string.

    Returns:
        str: The json string with replaced values.
    """
    # Local import keeps this fix self-contained without touching the module import block
    import json

    # JSON templates are UTF-8; do not depend on the platform default encoding
    with open(file_name, encoding="utf-8") as template_file:
        definition = template_file.read()

    for placeholder, replacement in values_to_assign.items():
        token = f"{placeholder}"
        if not token:
            raise ValueError("Placeholder keys must be non-empty strings.")
        if not isinstance(replacement, str):
            raise TypeError(f"Replacement for '{token}' must be a string.")

        # json.dumps yields a quoted JSON string; drop the surrounding quotes because
        # the placeholder already sits inside a string literal of the template.
        escaped_value = json.dumps(replacement)[1:-1]
        definition = definition.replace(token, escaped_value)

    return definition
+ + Args: + indexer_name: The name of the indexer to create or update. + index_name: The name of the index to use in the indexer. + skillset_name: The name of the skillset to use in the indexer. + datasource_name: The name of the data source to use in the indexer. + indexer_file: The path to the indexer definition file. + ai_search_uri: The URI of the AI Search service. + credential: The Azure credentials to use for authentication. + + Returns: + None + """ + # Create a search indexer client + try: + indexer_client = SearchIndexerClient(ai_search_uri, credential=credential, api_version=AI_SEARCH_API_VERSION) + + # read definition from the file and replace placeholders with actual values + definition = _prepare_json_schema( + indexer_file, + { + "": indexer_name, + "": index_name, + "": skillset_name, + "": datasource_name, + }, + ) + + # create an object of the indexer and initiate index creation process + indexer = SearchIndexer.deserialize(definition, APPLICATION_JSON_CONTENT_TYPE) + indexer_client.create_or_update_indexer(indexer=indexer) + except Exception as e: + logger.error(f"Failed to create or update the indexer '{indexer_name}': {e}") + raise + + +def create_or_update_datasource( + datasource_name: str, + datasource_file: str, + ai_search_uri: str, + subscription_id: str, + resource_group_name: str, + storage_account_name: str, + container_name: str, + credential: DefaultAzureCredential, +): + """ + Create or update the data source in the AI Search service. + + Args: + datasource_name: The name of the data source to create or update. + datasource_file: The path to the data source definition file. + subscription_id: The Azure subscription ID. + resource_group_name: The name of the Azure resource group. + storage_account_name: The name of the Azure storage account. + container_name: The name of the Azure storage container. + ai_search_uri: The URI of the AI Search service. + credential: The Azure credentials to use for authentication. 
+ + Returns: + None + """ + try: + # Create the connection string for the storage account applying Entra ID approach + # The connection string is in the format: "ResourceId=/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Storage/storageAccounts/{storage_account_name};" + conn_string = _get_storage_conn_string(subscription_id, storage_account_name, resource_group_name) + + # Create a search indexer client + indexer_client = SearchIndexerClient(ai_search_uri, credential=credential, api_version=AI_SEARCH_API_VERSION) + + # read definition from the file and replace placeholders with actual values + definition = _prepare_json_schema( + datasource_file, + { + "": conn_string, + "": container_name, + "": datasource_name, + }, + ) + + # create an object of the data source connection and initiate data source creation process + data_source_connection = SearchIndexerDataSourceConnection.deserialize( + definition, APPLICATION_JSON_CONTENT_TYPE + ) + + # Explicitly setting the connection string as it is required by the SearchIndexerDataSourceConnection object + # to properly establish the connection, even though credentials are provided. 
def _get_storage_conn_string(
    subscription_id: str,
    storage_account_name: str,
    resource_group_name: str,
) -> str:
    """
    Build the resource-ID based connection string of a storage account.

    The result has the form
    ``ResourceId=/subscriptions/<id>/resourceGroups/<group>/providers/Microsoft.Storage/storageAccounts/<account>;``.

    Args:
        subscription_id: The Azure subscription ID.
        storage_account_name: The name of the Azure storage account.
        resource_group_name: The name of the Azure resource group.

    Returns:
        str: The connection string, terminated by a semicolon.
    """
    # Note the parameter order: the storage account comes before the resource group
    # in the signature but after it in the resource path.
    resource_path = "/".join(
        (
            f"/subscriptions/{subscription_id}",
            f"resourceGroups/{resource_group_name}",
            "providers/Microsoft.Storage",
            f"storageAccounts/{storage_account_name}",
        )
    )
    return f"ResourceId={resource_path};"
def _valid_resource_group_name(value):
    """
    Argparse type callback for Azure resource group names.

    Resource group names may contain alphanumerics, "_", "-", ".", "(" and ")", are limited
    to 90 characters and must not end with a period, so the stricter ``valid_name`` check
    would reject legitimate names such as "rg.prod" or "rg(test)".

    Args:
        value (str): The resource group name to validate.
    Raises:
        argparse.ArgumentTypeError: If the name is empty, too long, ends with a period,
            or contains a character that is not allowed.
    Returns:
        str: The validated resource group name.
    """
    # Local import keeps this fix self-contained without touching the module import block
    import re

    if not re.fullmatch(r"[-\w.()]{1,90}", value or "") or value.endswith("."):
        raise argparse.ArgumentTypeError(
            f"'{value}' is not a valid resource group name. Look at the documentation for naming conventions."
        )
    return value


def main():
    """
    Create an indexer and related entities based on the configuration parameters.

    This function serves as the entry point for the script. It reads configuration parameters
    from command-line arguments, authenticates with Azure using default credentials, and
    orchestrates the creation or update of the following AI Search service components,
    in this order:

    - Search Index: Defines the structure of the searchable content.
    - Data Source: Specifies the source of the data to be indexed.
    - Skillset: Defines the AI enrichment pipeline for the data.
    - Indexer: Manages the process of pulling data from the data source, applying the skillset,
      and populating the search index.

    The function expects the following command-line arguments:
    - --aisearch_name: The name of the AI Search service.
    - --base_index_name: The base name used to generate names for the index, data source, skillset, and indexer.
    - --openai_api_base: The base URL of the OpenAI API.
    - --subscription_id: The Azure subscription ID.
    - --resource_group_name: The name of the Azure resource group.
    - --storage_name: The name of the Azure storage account.
    - --container_name: The name of the Azure storage container.

    The function uses these parameters to construct the necessary components and logs the progress
    of each operation. Errors raised by any of the creation steps propagate to the caller.
    """
    logger.info("Read and check parameters.")
    # Extract the configuration parameters from the command line
    parser = argparse.ArgumentParser(description="Parameter parser")
    parser.add_argument(
        "--aisearch_name",
        required=True,
        type=valid_name,
        help="name of the AI Search service",
    )
    parser.add_argument(
        "--base_index_name",
        required=True,
        type=valid_name,
        help="base name to form the index, data source, skillset and indexer names",
    )
    parser.add_argument(
        "--openai_api_base",
        type=absolute_url,
        required=True,
        help="base URL of the OpenAI API",
    )
    parser.add_argument(
        "--subscription_id",
        type=valid_name,
        required=True,
        help="Azure subscription ID",
    )
    parser.add_argument(
        "--resource_group_name",
        # Resource groups allow ".", "(" and ")", which valid_name would reject
        type=_valid_resource_group_name,
        required=True,
        help="Azure resource group name",
    )
    parser.add_argument(
        "--storage_name",
        type=valid_name,
        required=True,
        help="Azure storage account name",
    )
    parser.add_argument(
        "--container_name",
        type=valid_name,
        required=True,
        help="Azure storage container name",
    )
    args = parser.parse_args()

    # Using default Azure credentials assuming that it has all needed permissions
    logger.info("Authenticate code into Azure using default credentials.")
    credential = DefaultAzureCredential()

    ai_search_uri = f"https://{args.aisearch_name}.search.windows.net"

    # forming entity names based on the base name
    index_name = f"{args.base_index_name}-index"
    datasource_name = f"{args.base_index_name}-ds"
    skillset_name = f"{args.base_index_name}-skills"
    indexer_name = f"{args.base_index_name}-indexer"

    # Create the full document index
    logger.info("Initiate index creation method.")
    create_or_update_index(
        index_name,
        INDEX_SCHEMA_PATH,
        ai_search_uri,
        args.openai_api_base,
        credential,
    )
    logger.info("Index creation completed.")

    logger.info("Initiate data source creation method.")
    create_or_update_datasource(
        datasource_name,
        DATASOURCE_SCHEMA_PATH,
        ai_search_uri,
        args.subscription_id,
        args.resource_group_name,
        args.storage_name,
        args.container_name,
        credential,
    )
    logger.info("Data source creation completed.")

    # The skillset projects into the index, so it is created after the index
    logger.info("Initiate skillset creation method.")
    create_or_update_skillset(
        skillset_name,
        index_name,
        SKILLSET_SCHEMA_PATH,
        ai_search_uri,
        args.openai_api_base,
        credential,
    )
    logger.info("Skillset creation completed.")

    # The indexer references the index, skillset and data source, so it comes last
    logger.info("Initiate indexer creation method.")
    create_or_update_indexer(
        indexer_name,
        index_name,
        skillset_name,
        datasource_name,
        INDEXER_SCHEMA_PATH,
        ai_search_uri,
        credential,
    )
    logger.info("Indexer creation completed.")
DATASOURCE_SCHEMA_PATH, + ai_search_uri, + args.subscription_id, + args.resource_group_name, + args.storage_name, + args.container_name, + credential, + ) + logger.info("Data source creation completed.") + + logger.info("Initiate skillset creation method.") + create_or_update_skillset( + skillset_name, + index_name, + SKILLSET_SCHEMA_PATH, + ai_search_uri, + args.openai_api_base, + credential, + ) + logger.info("Skillset creation completed.") + + logger.info("Initiate indexer creation method.") + create_or_update_indexer( + indexer_name, + index_name, + skillset_name, + datasource_name, + INDEXER_SCHEMA_PATH, + ai_search_uri, + credential, + ) + logger.info("Indexer creation completed.") + + +# This block ensures that the script runs the main function only when executed directly, +# and not when imported as a module in another script. +if __name__ == "__main__": + main() diff --git a/python/chatwithdata/readme.md b/python/chatwithdata/readme.md new file mode 100644 index 0000000..c4e5e6a --- /dev/null +++ b/python/chatwithdata/readme.md @@ -0,0 +1,71 @@ +# How to Set Up Azure Resources +Create the following three resources in your Azure Resource Group: +* Azure AI Search +* Azure Blob Storage +* Azure AI Foundry + + +# 1. Azure AI Search +* Select Identity and enable System Assigned Managed Identity for Azure Search Service. + +* Go to Access Control (IAM) and assign the following role: +- Search Index Data Contributor +> **Note**: Select Managed Identity for your user account. + +# 2. Azure Blob Storage +* In Access Control (IAM), assign the following two role permissions: + +- Storage Blob Data Contributor +> **Note**: Select Managed Identity for the Azure AI Search resource + +- Storage Blob Data Contributor +> **Note**: Select Your User Account as Role assignment + +# 3. 
Azure AI Foundry +* In Access Control (IAM), assign: +- Cognitive Services OpenAI User +> **Note**: Select Managed Identity for the Azure AI Search resource + + +# Azure AI Search Python Scripts + +These scripts are designed to assist in creating and managing Azure AI Search components, including indexes, indexers, data sources, and skillsets (if required). They streamline the setup process, enabling efficient configuration and deployment of search capabilities. + +## How to test the scripts locally + +To test the scripts locally, follow these steps to create and activate a new virtual environment: + +```bash +# Create a new virtual environment with Python 3.12 +python -m venv venv + +# Activate the newly created environment +source venv/bin/activate # On Windows, use: venv\Scripts\activate +``` + +Next, install the required dependencies (requirements.txt is located in the same folder as this readme, python/chatwithdata): + +```bash +pip install -r requirements.txt +``` + +Log in to your Azure account to use your credentials in the code: + +```bash +az login -t <tenant_id> +``` + +Step 1: Upload data to blob storage +```bash +cd python + +python -m data.upload_data --storage_name <storage_name> --container_name <container_name> +``` + +Step 2: Create Index +```bash +# Modify the path according to your current folder +python -m src.index_utils --aisearch_name <aisearch_name> --base_index_name <base_index_name> --openai_api_base <openai_api_base> --subscription_id <subscription_id> --resource_group_name <resource_group_name> --storage_name <storage_name> --container_name <container_name> +``` + +The `base_index_name` parameter simplifies the script configuration by reducing the number of required parameters. The script automatically generates names for the index, skillset, indexer, and data source by appending the suffixes `-index`, `-skills`, `-indexer`, and `-ds` to the provided base name. 
diff --git a/python/chatwithdata/requirements.txt b/python/chatwithdata/requirements.txt new file mode 100644 index 0000000..72ef3d6 --- /dev/null +++ b/python/chatwithdata/requirements.txt @@ -0,0 +1,3 @@ +azure-identity>=1.16.1 +azure-search-documents==11.6.0b5 +azure-storage-blob==12.25.1