def pytest_collection_modifyitems(config, items):
    """Mark LCORE-incompatible tests as skipped during collection.

    The ``LCORE`` environment variable is read (default ``"False"``) and
    compared case-insensitively against ``"true"``, ``"1"`` and ``"t"``.
    When it matches, every collected item that carries the
    ``skip_with_lcore`` keyword receives a ``pytest.mark.skip`` marker.
    When it does not match, the collected items are left untouched.

    Args:
        config: pytest configuration object (not used by this hook).
        items: collected test items; modified in place via ``add_marker``.
    """
    lcore_flag = os.getenv("LCORE", "False").lower()
    if lcore_flag not in ("true", "1", "t"):
        # LCORE is off, so no test needs to be skipped
        return

    # a single marker instance is shared by all affected items
    lcore_skip_marker = pytest.mark.skip(
        reason="LCORE environment variable is enabled"
    )
    for test_item in items:
        if "skip_with_lcore" not in test_item.keywords:
            continue
        test_item.add_marker(lcore_skip_marker)
tests.e2e.utils.ols_installer import update_ols_config from tests.e2e.utils.postgres import ( read_conversation_history, read_conversation_history_count, @@ -51,7 +54,17 @@ def test_readiness(): response = pytest.client.get(endpoint, timeout=LLM_REST_API_TIMEOUT) assert response.status_code == requests.codes.ok response_utils.check_content_type(response, "application/json") - assert response.json() == {"ready": True, "reason": "service is ready"} + if os.getenv("LCORE", "False").lower() in ("true", "1", "t"): + assert response.json() == { + "ready": True, + "reason": "All providers are healthy", + "providers": [], + } + else: + assert response.json() == { + "ready": True, + "reason": "service is ready", + } @pytest.mark.smoketest @@ -82,6 +95,16 @@ def test_metrics() -> None: "ols_llm_token_received_total", "ols_provider_model_configuration", ) + if os.getenv("LCORE", "False").lower() in ("true", "1", "t"): + expected_counters = ( + "ls_rest_api_calls_total", + "ls_llm_calls_total", + "ls_llm_calls_failures_total", + "ls_llm_validation_errors_total", + "ls_llm_token_sent_total", + "ls_llm_token_received_total", + "ls_provider_model_configuration", + ) # check if all counters are present for expected_counter in expected_counters: @@ -92,12 +115,12 @@ def test_metrics() -> None: assert 'response_duration_seconds_sum{path="/metrics"}' in response.text +@pytest.mark.skip_with_lcore def test_model_provider(): """Read configured model and provider from metrics.""" model, provider = metrics_utils.get_enabled_model_and_provider( pytest.metrics_client ) - # enabled model must be one of our expected combinations assert model, provider in { ("gpt-4o-mini", "openai"), @@ -106,6 +129,7 @@ def test_model_provider(): } +@pytest.mark.skip_with_lcore def test_one_default_model_provider(): """Check if one model and provider is selected as default.""" states = metrics_utils.get_enable_status_for_all_models(pytest.metrics_client) @@ -124,9 +148,13 @@ def test_improper_token(): 
@pytest.fixture
def turn_off_operator_pod():
    """Keep the operator deployment scaled down while a test runs.

    Setup scales ``deployment/lightspeed-operator-controller-manager`` to
    zero replicas; after the test has finished the deployment is scaled
    back to one replica.

    The operator is turned off so that lightspeed-stack can be modified
    without waiting for the lightspeed service pod to restart.
    """
    operator_deployment = "deployment/lightspeed-operator-controller-manager"

    def _scale_operator(replica_count: str) -> None:
        # same `oc scale` invocation for both directions, only the count differs
        cluster_utils.run_oc(
            ["scale", operator_deployment, "--replicas", replica_count]
        )

    _scale_operator("0")
    yield
    _scale_operator("1")
def update_olsconfig(limiters: list[dict]):
    """Replace the quota limiters stored in the service ConfigMap.

    The ConfigMap to edit depends on the ``LCORE`` environment variable
    (truthy values: ``"true"``, ``"1"``, ``"t"``, case-insensitive):

    * LCORE enabled: ConfigMap ``lightspeed-stack-config``, entry
      ``lightspeed-stack.yaml``, key ``quota_handlers.limiters``.
    * otherwise: ConfigMap ``olsconfig``, entry
      ``DEFAULT_CONFIGURATION_FILE``, key
      ``ols_config.quota_handlers.limiters``.

    The ConfigMap is read with ``oc get``, patched in memory, then deleted
    and re-created with ``oc apply``.

    Args:
        limiters: List of dictionaries containing limiter configurations
            to store under the ``limiters`` key.
    """
    lcore_mode = os.getenv("LCORE", "False").lower() in ("true", "1", "t")

    # pick the ConfigMap and the data entry that holds the service config
    cm_name = "lightspeed-stack-config" if lcore_mode else "olsconfig"
    data_key = "lightspeed-stack.yaml" if lcore_mode else DEFAULT_CONFIGURATION_FILE

    fetched = cluster_utils.run_oc(["get", f"cm/{cm_name}", "-o", "yaml"])
    cm_manifest = yaml.safe_load(fetched.stdout)
    service_config = yaml.safe_load(cm_manifest["data"][data_key])

    # lightspeed-stack keeps quota_handlers at the top level,
    # the OLS config nests it under ols_config
    quota_parent = service_config if lcore_mode else service_config["ols_config"]
    quota_parent["quota_handlers"]["limiters"] = limiters
    cm_manifest["data"][data_key] = yaml.dump(service_config)

    # serialize before touching the cluster, then delete and re-create
    rendered_manifest = yaml.dump(cm_manifest)
    cluster_utils.run_oc(["delete", "configmap", cm_name])
    cluster_utils.run_oc(["apply", "-f", "-"], command=rendered_manifest)
+@pytest.mark.usefixtures("turn_off_operator_pod") @pytest.mark.quota_limits def test_quota_limits(): """Verify OLS quota limits.""" @@ -522,7 +596,6 @@ def test_quota_limits(): json={"query": "what is kubernetes?"}, timeout=LLM_REST_API_TIMEOUT, ) - # assert that the available quota is # less than the initial one hardcoded in the olsconfig assert ( @@ -539,7 +612,7 @@ def test_quota_limits(): cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "0", ] @@ -558,7 +631,7 @@ def test_quota_limits(): cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "1", ] @@ -580,7 +653,7 @@ def test_quota_limits(): cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "0", ] @@ -589,7 +662,7 @@ def test_quota_limits(): cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "1", ] diff --git a/tests/e2e/test_attachments.py b/tests/e2e/test_attachments.py index 0e92c0df0..842ddcd20 100644 --- a/tests/e2e/test_attachments.py +++ b/tests/e2e/test_attachments.py @@ -4,6 +4,8 @@ # properly by linters # pyright: reportAttributeAccessIssue=false +import os + import pytest import requests @@ -18,7 +20,6 @@ def test_valid_question_with_empty_attachment_list() -> None: """Check the REST API /v1/query with POST HTTP method using empty attachment list.""" endpoint = "/v1/query" - with metrics_utils.RestAPICallCounterChecker( pytest.metrics_client, endpoint, status_code=requests.codes.ok ): @@ -232,15 +233,28 @@ def test_valid_question_with_wrong_attachment_format_unknown_attachment_type() - # the attachment should not be processed correctly assert response.status_code == requests.codes.unprocessable_entity - json_response = response.json() - expected_response = { - "detail": { - "response": "Unable to 
process this request", - "cause": "Attachment with improper type unknown_type detected", + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + expected_response = { + "detail": { + "response": "Unable to process this request", + "cause": "Attachment with improper type unknown_type detected", + } } - } - assert json_response == expected_response + assert json_response == expected_response + else: + assert "Invalid attribute value" in json_response["detail"]["response"] + assert ( + "Invalid attatchment type unknown_type: must be one of frozenset" + in json_response["detail"]["cause"] + ) + assert "event" in json_response["detail"]["cause"] + assert "log" in json_response["detail"]["cause"] + assert "stack trace" in json_response["detail"]["cause"] + assert "alert" in json_response["detail"]["cause"] + assert "configuration" in json_response["detail"]["cause"] + assert "api object" in json_response["detail"]["cause"] + assert "error message" in json_response["detail"]["cause"] @retry(max_attempts=3, wait_between_runs=10) @@ -271,12 +285,18 @@ def test_valid_question_with_wrong_attachment_format_unknown_content_type() -> N # the attachment should not be processed correctly assert response.status_code == requests.codes.unprocessable_entity - json_response = response.json() - expected_response = { - "detail": { - "response": "Unable to process this request", - "cause": "Attachment with improper content type unknown/type detected", + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + expected_response = { + "detail": { + "response": "Unable to process this request", + "cause": "Attachment with improper content type unknown/type detected", + } } - } - assert json_response == expected_response + assert json_response == expected_response + else: + assert "Invalid attribute value" in json_response["detail"]["response"] + assert "application/json" in json_response["detail"]["cause"] + assert "application/xml" in 
json_response["detail"]["cause"] + assert "application/yaml" in json_response["detail"]["cause"] + assert "text/plain" in json_response["detail"]["cause"] diff --git a/tests/e2e/test_query_endpoint.py b/tests/e2e/test_query_endpoint.py index d629c62ed..a7eb4ad10 100644 --- a/tests/e2e/test_query_endpoint.py +++ b/tests/e2e/test_query_endpoint.py @@ -4,6 +4,7 @@ # properly by linters # pyright: reportAttributeAccessIssue=false +import os import re import pytest @@ -20,6 +21,7 @@ QUERY_ENDPOINT = "/v1/query" +@pytest.mark.skip_with_lcore def test_invalid_question(): """Check the REST API /v1/query with POST HTTP method for invalid question.""" with metrics_utils.RestAPICallCounterChecker(pytest.metrics_client, QUERY_ENDPOINT): @@ -77,11 +79,11 @@ def test_invalid_question_without_conversation_id(): json_response["response"], re.IGNORECASE, ) - - # new conversation ID should be generated - assert suid.check_suid( - json_response["conversation_id"] - ), "Conversation ID is not in UUID format" + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + # new conversation ID should be generated + assert suid.check_suid( + json_response["conversation_id"] + ), "Conversation ID is not in UUID format" def test_query_call_without_payload(): @@ -125,6 +127,7 @@ def test_query_call_with_improper_payload(): assert "missing" in response.text +@pytest.mark.skip_with_lcore def test_valid_question_improper_conversation_id() -> None: """Check the REST API /v1/query with POST HTTP method for improper conversation ID.""" with metrics_utils.RestAPICallCounterChecker( @@ -150,6 +153,7 @@ def test_valid_question_improper_conversation_id() -> None: assert json_response == expected_response +@pytest.mark.skip_with_lcore @retry(max_attempts=3, wait_between_runs=10) def test_valid_question_missing_conversation_id() -> None: """Check the REST API /v1/query with POST HTTP method for missing conversation ID.""" @@ -175,6 +179,7 @@ def test_valid_question_missing_conversation_id() -> 
None: ), "Conversation ID is not in UUID format" +@pytest.mark.skip_with_lcore def test_too_long_question() -> None: """Check the REST API /v1/query with too long question.""" # let's make the query really large, larger that context window size @@ -205,23 +210,37 @@ def test_too_long_question() -> None: def test_valid_question() -> None: """Check the REST API /v1/query with POST HTTP method for valid question and no yaml.""" with metrics_utils.RestAPICallCounterChecker(pytest.metrics_client, QUERY_ENDPOINT): - cid = suid.get_suid() - response = pytest.client.post( - QUERY_ENDPOINT, - json={ - "conversation_id": cid, - "query": "what is kubernetes in the context of OpenShift?", - }, - timeout=test_api.LLM_REST_API_TIMEOUT, - ) - assert response.status_code == requests.codes.ok - - response_utils.check_content_type(response, "application/json") - print(vars(response)) - json_response = response.json() - - # checking a few major information from response - assert json_response["conversation_id"] == cid + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + cid = suid.get_suid() + response = pytest.client.post( + QUERY_ENDPOINT, + json={ + "conversation_id": cid, + "query": "what is kubernetes in the context of OpenShift?", + }, + timeout=test_api.LLM_REST_API_TIMEOUT, + ) + assert response.status_code == requests.codes.ok + + response_utils.check_content_type(response, "application/json") + print(vars(response)) + json_response = response.json() + + # checking a few major information from response + assert json_response["conversation_id"] == cid + else: + response = pytest.client.post( + QUERY_ENDPOINT, + json={ + "query": "what is kubernetes in the context of OpenShift?", + }, + timeout=test_api.LLM_REST_API_TIMEOUT, + ) + assert response.status_code == requests.codes.ok + + response_utils.check_content_type(response, "application/json") + print(vars(response)) + json_response = response.json() assert re.search( r"kubernetes|openshift", 
json_response["response"], @@ -256,6 +275,7 @@ def test_ocp_docs_version_same_as_cluster_version() -> None: assert f"{major}.{minor}" in json_response["referenced_documents"][0]["doc_url"] +@pytest.mark.skip_with_lcore def test_valid_question_tokens_counter() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -275,6 +295,7 @@ def test_valid_question_tokens_counter() -> None: response_utils.check_content_type(response, "application/json") +@pytest.mark.skip_with_lcore def test_invalid_question_tokens_counter() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -294,6 +315,7 @@ def test_invalid_question_tokens_counter() -> None: response_utils.check_content_type(response, "application/json") +@pytest.mark.skip_with_lcore def test_token_counters_for_query_call_without_payload() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -322,6 +344,7 @@ def test_token_counters_for_query_call_without_payload() -> None: response_utils.check_content_type(response, "application/json") +@pytest.mark.skip_with_lcore def test_token_counters_for_query_call_with_improper_payload() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -378,6 +401,7 @@ def test_rag_question() -> None: assert len(doc_urls_list) == len(set(doc_urls_list)) +@pytest.mark.skip_with_lcore @pytest.mark.cluster def test_query_filter() -> None: """Ensure responses does not include filtered words and redacted words are not logged.""" @@ -417,7 +441,7 @@ def test_query_filter() -> None: continue # check that the pattern is indeed not found in logs for pattern in unwanted_patterns: - assert pattern not in line.lower() + assert pattern not in line.lower(), f"failed for {pattern}" # 
Ensure the intended redaction has occurred assert "what is deployment in openshift?" in container_log @@ -461,6 +485,7 @@ def test_conversation_history() -> None: assert "ingress" in response_text, debug_msg +@pytest.mark.skip_with_lcore def test_query_with_provider_but_not_model() -> None: """Check the REST API /v1/query with POST HTTP method for provider specified, but no model.""" with metrics_utils.RestAPICallCounterChecker( @@ -490,6 +515,7 @@ def test_query_with_provider_but_not_model() -> None: ) +@pytest.mark.skip_with_lcore def test_query_with_model_but_not_provider() -> None: """Check the REST API /v1/query with POST HTTP method for model specified, but no provider.""" with metrics_utils.RestAPICallCounterChecker( @@ -518,6 +544,7 @@ def test_query_with_model_but_not_provider() -> None: ) +@pytest.mark.skip_with_lcore def test_query_with_unknown_provider() -> None: """Check the REST API /v1/query with POST HTTP method for unknown provider specified.""" # retrieve currently selected model @@ -555,6 +582,7 @@ def test_query_with_unknown_provider() -> None: ) +@pytest.mark.skip_with_lcore def test_query_with_unknown_model() -> None: """Check the REST API /v1/query with POST HTTP method for unknown model specified.""" # retrieve currently selected provider diff --git a/tests/e2e/test_streaming_query_endpoint.py b/tests/e2e/test_streaming_query_endpoint.py index c3a67790b..8718de044 100644 --- a/tests/e2e/test_streaming_query_endpoint.py +++ b/tests/e2e/test_streaming_query_endpoint.py @@ -5,6 +5,7 @@ # pyright: reportAttributeAccessIssue=false import json +import os import re import pytest @@ -53,6 +54,7 @@ def construct_response_from_streamed_events(events: dict) -> str: return response +@pytest.mark.skip_with_lcore def test_invalid_question(): """Check the endpoint POST method for invalid question.""" with metrics_utils.RestAPICallCounterChecker( @@ -98,7 +100,8 @@ def test_invalid_question_without_conversation_id(): # new conversation ID should be 
generated assert events[0]["event"] == "start" assert events[0]["data"] - assert suid.check_suid(events[0]["data"]["conversation_id"]) + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + assert suid.check_suid(events[0]["data"]["conversation_id"]) def test_query_call_without_payload(): @@ -139,6 +142,7 @@ def test_query_call_with_improper_payload(): assert "missing" in response.text +@pytest.mark.skip_with_lcore def test_valid_question_improper_conversation_id() -> None: """Check the endpoint with POST HTTP method for improper conversation ID.""" with metrics_utils.RestAPICallCounterChecker( @@ -163,6 +167,7 @@ def test_valid_question_improper_conversation_id() -> None: assert json_response == expected_response +@pytest.mark.skip_with_lcore def test_too_long_question() -> None: """Check the endpoint with too long question.""" # let's make the query really large, larger that context window size @@ -200,11 +205,24 @@ def test_valid_question() -> None: with metrics_utils.RestAPICallCounterChecker( pytest.metrics_client, STREAMING_QUERY_ENDPOINT ): - cid = suid.get_suid() - response = post_with_defaults( - STREAMING_QUERY_ENDPOINT, - json={"conversation_id": cid, "query": "what is kubernetes?"}, - ) + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + cid = suid.get_suid() + response = pytest.client.post( + STREAMING_QUERY_ENDPOINT, + json={ + "conversation_id": cid, + "query": "what is kubernetes in the context of OpenShift?", + }, + timeout=test_api.LLM_REST_API_TIMEOUT, + ) + else: + response = pytest.client.post( + STREAMING_QUERY_ENDPOINT, + json={ + "query": "what is kubernetes in the context of OpenShift?", + }, + timeout=test_api.LLM_REST_API_TIMEOUT, + ) assert response.status_code == requests.codes.ok response_utils.check_content_type(response, constants.MEDIA_TYPE_TEXT) @@ -244,6 +262,7 @@ def test_ocp_docs_version_same_as_cluster_version() -> None: ) +@pytest.mark.skip_with_lcore def test_valid_question_tokens_counter() -> 
None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -264,6 +283,7 @@ def test_valid_question_tokens_counter() -> None: response_utils.check_content_type(response, constants.MEDIA_TYPE_TEXT) +@pytest.mark.skip_with_lcore def test_invalid_question_tokens_counter() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -284,6 +304,7 @@ def test_invalid_question_tokens_counter() -> None: response_utils.check_content_type(response, constants.MEDIA_TYPE_TEXT) +@pytest.mark.skip_with_lcore def test_token_counters_for_query_call_without_payload() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -311,6 +332,7 @@ def test_token_counters_for_query_call_without_payload() -> None: response_utils.check_content_type(response, constants.MEDIA_TYPE_JSON) +@pytest.mark.skip_with_lcore def test_token_counters_for_query_call_with_improper_payload() -> None: """Check how the tokens counter are updated accordingly.""" model, provider = metrics_utils.get_enabled_model_and_provider( @@ -372,6 +394,7 @@ def test_rag_question() -> None: assert len(set(docs_urls)) == len(docs_urls) +@pytest.mark.skip_with_lcore @pytest.mark.cluster def test_query_filter() -> None: """Ensure responses does not include filtered words and redacted words are not logged.""" @@ -463,6 +486,7 @@ def test_conversation_history() -> None: assert "ingress" in response_text, scenario_fail_msg +@pytest.mark.skip_with_lcore def test_query_with_provider_but_not_model() -> None: """Check the endpoint with POST HTTP method for provider specified, but no model.""" with metrics_utils.RestAPICallCounterChecker( @@ -491,6 +515,7 @@ def test_query_with_provider_but_not_model() -> None: ) +@pytest.mark.skip_with_lcore def test_query_with_model_but_not_provider() -> None: """Check the 
endpoint with POST HTTP method for model specified, but no provider.""" with metrics_utils.RestAPICallCounterChecker( @@ -518,6 +543,7 @@ def test_query_with_model_but_not_provider() -> None: ) +@pytest.mark.skip_with_lcore def test_query_with_unknown_provider() -> None: """Check the endpoint with POST HTTP method for unknown provider specified.""" # retrieve currently selected model @@ -554,6 +580,7 @@ def test_query_with_unknown_provider() -> None: ) +@pytest.mark.skip_with_lcore def test_query_with_unknown_model() -> None: """Check the endpoint with POST HTTP method for unknown model specified.""" # retrieve currently selected provider diff --git a/tests/e2e/test_user_feedback.py b/tests/e2e/test_user_feedback.py index ed3a4c9a8..bbc584dff 100644 --- a/tests/e2e/test_user_feedback.py +++ b/tests/e2e/test_user_feedback.py @@ -4,11 +4,14 @@ # properly by linters # pyright: reportAttributeAccessIssue=false +import os + import pytest import requests from tests.e2e.utils import cluster as cluster_utils from tests.e2e.utils import response as response_utils +from tests.e2e.utils.constants import OLS_SERVICE_DEPLOYMENT from . 
import test_api @@ -27,14 +30,17 @@ def test_feedback_can_post_with_wrong_token(): timeout=test_api.BASIC_ENDPOINTS_TIMEOUT, headers={"Authorization": "Bearer wrong-token"}, ) - assert response.status_code == requests.codes.forbidden + if os.getenv("LCORE", "False").lower() not in ("true", "1", "t"): + assert response.status_code == requests.codes.forbidden + else: + assert response.status_code == requests.codes.unauthorized @pytest.mark.data_export def test_feedback_storing_cluster(): """Test if the feedbacks are stored properly.""" feedbacks_path = test_api.OLS_USER_DATA_PATH + "/feedback" - pod_name = cluster_utils.get_pod_by_prefix()[0] + pod_name = cluster_utils.get_pod_by_prefix(OLS_SERVICE_DEPLOYMENT)[0] # there are multiple tests running agains cluster, so transcripts # can be already present - we need to ensure the storage is empty diff --git a/tests/e2e/utils/adapt_ols_config.py b/tests/e2e/utils/adapt_ols_config.py index c57914dfd..39d7fbe95 100644 --- a/tests/e2e/utils/adapt_ols_config.py +++ b/tests/e2e/utils/adapt_ols_config.py @@ -9,10 +9,20 @@ import yaml from ols.constants import DEFAULT_CONFIGURATION_FILE +from tests.e2e.utils import client as client_utils from tests.e2e.utils import cluster as cluster_utils +from tests.e2e.utils.constants import OLS_SERVICE_DEPLOYMENT, LCORE_ENABLED from tests.e2e.utils.data_collector_control import configure_exporter_for_e2e_tests +from tests.e2e.utils.ols_installer import ( + create_secrets, + get_service_account_tokens, + setup_rbac, + setup_route, + setup_service_accounts, + update_lcore_setting, + update_ols_config, +) from tests.e2e.utils.retry import retry_until_timeout_or_success -from tests.e2e.utils.wait_for_ols import wait_for_ols def apply_olsconfig(provider_list: list[str]) -> None: @@ -49,28 +59,21 @@ def update_ols_configmap() -> None: """Update OLS configmap with additional e2e test configurations. Configures logging levels and user data collector settings for testing. 
+ This is a wrapper around update_ols_config that adds data_export specific settings. """ - try: - # Get the current configmap - configmap_yaml = cluster_utils.run_oc( - ["get", "cm/olsconfig", "-o", "yaml"] - ).stdout - configmap = yaml.safe_load(configmap_yaml) - olsconfig = yaml.safe_load(configmap["data"][DEFAULT_CONFIGURATION_FILE]) - - # Ensure proper logging config for e2e tests - if "ols_config" not in olsconfig: - olsconfig["ols_config"] = {} - if "logging_config" not in olsconfig["ols_config"]: - olsconfig["ols_config"]["logging_config"] = {} - - # Set INFO level to avoid redacted logs - olsconfig["ols_config"]["logging_config"]["lib_log_level"] = "INFO" - - # Configure user data collection only for data_export test suite - # Other test suites don't need it and the volume might not be mounted - ols_config_suffix = os.getenv("OLS_CONFIG_SUFFIX", "default") - if ols_config_suffix == "data_export": + # First apply the standard config updates + update_ols_config() + + # Then add data_export specific user data collection config if needed + ols_config_suffix = os.getenv("OLS_CONFIG_SUFFIX", "default") + if ols_config_suffix == "data_export": + try: + configmap_yaml = cluster_utils.run_oc( + ["get", "cm/olsconfig", "-o", "yaml"] + ).stdout + configmap = yaml.safe_load(configmap_yaml) + olsconfig = yaml.safe_load(configmap["data"][DEFAULT_CONFIGURATION_FILE]) + olsconfig["ols_config"]["user_data_collection"] = { "feedback_disabled": False, "feedback_storage": "/app-root/ols-user-data/feedback", @@ -78,82 +81,20 @@ def update_ols_configmap() -> None: "transcripts_storage": "/app-root/ols-user-data/transcripts", } - # Update the configmap - configmap["data"][DEFAULT_CONFIGURATION_FILE] = yaml.dump(olsconfig) - updated_configmap = yaml.dump(configmap) - cluster_utils.run_oc(["apply", "-f", "-"], command=updated_configmap) - print("OLS configmap updated successfully") - - except Exception as e: - raise RuntimeError( - f"Failed to update OLS configmap with e2e 
settings: {e}" - ) from e - - -def setup_service_accounts(namespace: str) -> None: - """Set up service accounts and access roles. - - Args: - namespace: The Kubernetes namespace to create service accounts in. - """ - print("Ensuring 'test-user' service account exists...") - cluster_utils.run_oc( - ["create", "sa", "test-user", "-n", namespace], - ignore_existing_resource=True, - ) - - print("Ensuring 'metrics-test-user' service account exists...") - cluster_utils.run_oc( - ["create", "sa", "metrics-test-user", "-n", namespace], - ignore_existing_resource=True, - ) - - print("Granting access roles to service accounts...") - cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access") - cluster_utils.grant_sa_user_access( - "metrics-test-user", "lightspeed-operator-ols-metrics-reader" - ) - - -def setup_rbac(namespace: str) -> None: - """Set up pod-reader role and binding. - - Args: - namespace: The Kubernetes namespace for RBAC configuration. - """ - print("Ensuring 'pod-reader' role and rolebinding exist...") - cluster_utils.run_oc( - [ - "create", - "role", - "pod-reader", - "--verb=get,list", - "--resource=pods", - "--namespace", - namespace, - ], - ignore_existing_resource=True, - ) - - cluster_utils.run_oc( - [ - "create", - "rolebinding", - "test-user-pod-reader", - "--role=pod-reader", - f"--serviceaccount={namespace}:test-user", - "--namespace", - namespace, - ], - ignore_existing_resource=True, - ) - print("RBAC setup verified.") + configmap["data"][DEFAULT_CONFIGURATION_FILE] = yaml.dump(olsconfig) + updated_configmap = yaml.dump(configmap) + cluster_utils.run_oc(["apply", "-f", "-"], command=updated_configmap) + print("Data export configmap settings applied successfully") + except Exception as e: + raise RuntimeError( + f"Failed to update OLS configmap with data export settings: {e}" + ) from e def wait_for_deployment() -> None: """Wait for OLS deployment and pods to be ready. 
- Ensures the lightspeed-app-server deployment is available and pods are running. + Ensures the service deployment is available and pods are running. """ print("Waiting for OLS deployment to be available...") retry_until_timeout_or_success( @@ -163,42 +104,18 @@ def wait_for_deployment() -> None: [ "get", "deployment", - "lightspeed-app-server", + OLS_SERVICE_DEPLOYMENT, "--ignore-not-found", "-o", "name", ] ).stdout.strip() - == "deployment.apps/lightspeed-app-server", + == f"deployment.apps/{OLS_SERVICE_DEPLOYMENT}", "Waiting for lightspeed-app-server deployment to be detected", ) print("Waiting for pods to be ready...") - cluster_utils.wait_for_running_pod() - - -def setup_route() -> str: - """Set up route and return OLS URL. - - Returns: - The HTTPS URL for accessing the OLS service. - """ - try: - cluster_utils.run_oc(["delete", "route", "ols"], ignore_existing_resource=False) - except Exception: - print("No existing route to delete. Continuing...") - - print("Creating route for OLS access") - cluster_utils.run_oc( - ["create", "-f", "tests/config/operator_install/route.yaml"], - ignore_existing_resource=False, - ) - - url = cluster_utils.run_oc( - ["get", "route", "ols", "-o", "jsonpath='{.spec.host}'"] - ).stdout.strip("'") - - return f"https://{url}" + cluster_utils.wait_for_running_pod(name=OLS_SERVICE_DEPLOYMENT) def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 @@ -215,21 +132,14 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 provider_list = provider_env.split() or ["openai"] ols_image = os.getenv("OLS_IMAGE", "") namespace = "openshift-lightspeed" + creds = os.getenv("PROVIDER_KEY_PATH", "empty") + cluster_utils.run_oc( + ["project", "openshift-lightspeed"], ignore_existing_resource=True + ) - print("Checking for existing app server deployment...") - try: - cluster_utils.run_oc( - ["scale", "deployment/lightspeed-app-server", "--replicas", "0"] - ) - retry_until_timeout_or_success( - 30, - 3, - lambda: 
not cluster_utils.get_pod_by_prefix(fail_not_found=False), - "Waiting for old app server pod to terminate", - ) - print("Old app server scaled down") - except Exception as e: - print(f"No existing app server to scale down (this is OK): {e}") + # Update lcore setting if LCORE is enabled + if LCORE_ENABLED: + update_lcore_setting() # Scaling operator to 1 replica to allow finalizer to run for olsconfig cluster_utils.run_oc( [ @@ -240,26 +150,14 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 ] ) # Wait for operator pod to be ready - retry_until_timeout_or_success( - 60, - 5, - lambda: ( - pods := cluster_utils.get_pod_by_prefix( - prefix="lightspeed-operator-controller-manager", fail_not_found=False - ) - ) - and all( - status == "true" - for status in cluster_utils.get_container_ready_status(pods[0]) - ), - "Waiting for operator to be ready", - ) + cluster_utils.wait_for_running_pod("lightspeed-operator-controller-manager") try: - cluster_utils.run_oc(["delete", "olsconfig", "cluster", "--ignore-not-found"]) - print(" Old OLSConfig CR removed") + cluster_utils.run_oc(["delete", "secret", "llmcreds", "--ignore-not-found"]) except Exception as e: - print(f"Could not delete old OLSConfig: {e}") - + print(f"Could not delete old secret: {e}") + creds_list = creds.split() + for i, prov in enumerate(provider_list): + create_secrets(prov, creds_list[i], len(provider_list)) try: apply_olsconfig(provider_list) print("New OLSConfig CR applied") @@ -278,7 +176,7 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 [ "get", "deployment", - "lightspeed-app-server", + OLS_SERVICE_DEPLOYMENT, "--ignore-not-found", "-o", "jsonpath={.status.replicas}", @@ -309,7 +207,7 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 # Scale down app server to apply e2e configurations print("Scaling down app server to apply e2e configurations...") cluster_utils.run_oc( - ["scale", "deployment/lightspeed-app-server", 
"--replicas", "0"] + ["scale", f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "0"] ) retry_until_timeout_or_success( @@ -322,7 +220,8 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 # Update configmap with e2e-specific settings - FAIL FAST if this breaks print("Updating configmap with e2e test settings...") - update_ols_configmap() + if not LCORE_ENABLED: + update_ols_configmap() print(" Configmap updated successfully") # Apply test image if ols_image: @@ -336,7 +235,7 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 cluster_utils.run_oc( [ "patch", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--type", "json", "-p", @@ -351,7 +250,7 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 # Scale back up print("Scaling up app server with new configuration...") cluster_utils.run_oc( - ["scale", "deployment/lightspeed-app-server", "--replicas", "1"] + ["scale", f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "1"] ) # Wait for deployment to be ready @@ -371,10 +270,19 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 except Exception as e: print(f"Warning: Could not ensure pod-reader role/binding: {e}") + # Fetch tokens for service accounts + token, metrics_token = get_service_account_tokens() + + # Set up route and get URL + ols_url = setup_route() + # Configure exporter for e2e tests with proper settings try: print("Configuring exporter for e2e tests...") + # Create client for the exporter configuration + test_client = client_utils.get_http_client(ols_url, token) configure_exporter_for_e2e_tests( + client=test_client, interval_seconds=3600, # 1 hour to prevent interference ingress_env="stage", log_level="DEBUG", @@ -385,19 +293,6 @@ def adapt_ols_config() -> tuple[str, str, str]: # pylint: disable=R0915 print(f"Warning: Could not configure exporter: {e}") print("Tests may experience interference from data collector") - # 
Fetch tokens for service accounts - print("Fetching tokens for service accounts...") - token = cluster_utils.get_token_for("test-user") - metrics_token = cluster_utils.get_token_for("metrics-test-user") - - # Set up route and get URL - ols_url = setup_route() - - # Wait for OLS to be ready - print(f"Waiting for OLS to be ready at {ols_url}...") - if not wait_for_ols(ols_url, timeout=180): - raise RuntimeError("OLS failed to become ready after configuration") - print("OLS configuration and access setup completed successfully.") return ols_url, token, metrics_token diff --git a/tests/e2e/utils/cluster.py b/tests/e2e/utils/cluster.py index 3252d632c..7c417ee78 100644 --- a/tests/e2e/utils/cluster.py +++ b/tests/e2e/utils/cluster.py @@ -362,10 +362,10 @@ def wait_for_running_pod( get_pod_by_prefix(prefix=name, namespace=namespace, fail_not_found=False) ) == 1, - "Waiting for service pod in running state", + f"Waiting for {name} pod in running state", ) if not r: - raise Exception("Timed out waiting for new OLS pod to be ready") + raise Exception(f"Timed out waiting for {name} pod to be ready") def pod_has_containers_ready(): pods = get_pod_by_prefix(prefix=name, namespace=namespace, fail_not_found=False) @@ -384,8 +384,9 @@ def pod_has_containers_ready(): ols_config_suffix = os.getenv("OLS_CONFIG_SUFFIX", "default") tool_calling_enabled = "tool_calling" in ols_config_suffix - if tool_calling_enabled: - return ready_containers >= 2 + if name == "lightspeed-app-server-": + if tool_calling_enabled: + return ready_containers >= 2 return ready_containers >= 1 # wait for the containers in the server pod to become ready diff --git a/tests/e2e/utils/constants.py b/tests/e2e/utils/constants.py index ddabb709b..6973d1bf6 100644 --- a/tests/e2e/utils/constants.py +++ b/tests/e2e/utils/constants.py @@ -1,5 +1,7 @@ """Constants for end-to-end tests.""" +import os + # timeout settings BASIC_ENDPOINTS_TIMEOUT = 5 NON_LLM_REST_API_TIMEOUT = 20 @@ -18,3 +20,11 @@ 
OLS_USER_DATA_COLLECTION_INTERVAL_SHORT = ( 5 # 5 seconds - used only in data collection test ) + +LCORE_ENABLED = True if os.getenv("LCORE", "False").lower() in ("true", "1", "t") else False + +OLS_SERVICE_DEPLOYMENT = ( + "lightspeed-stack-deployment" + if os.getenv("LCORE", "False").lower() in ("true", "1", "t") + else "lightspeed-app-server" +) diff --git a/tests/e2e/utils/data_collector_control.py b/tests/e2e/utils/data_collector_control.py index 963330443..dfcc6d808 100644 --- a/tests/e2e/utils/data_collector_control.py +++ b/tests/e2e/utils/data_collector_control.py @@ -11,7 +11,7 @@ import yaml from tests.e2e.utils import cluster as cluster_utils -from tests.e2e.utils.constants import OLS_USER_DATA_PATH +from tests.e2e.utils.constants import OLS_SERVICE_DEPLOYMENT, OLS_USER_DATA_PATH from tests.e2e.utils.wait_for_ols import wait_for_ols # Exporter config map constants @@ -168,7 +168,7 @@ def set_exporter_collection_interval(self, interval_seconds: int) -> None: cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "-n", EXPORTER_NAMESPACE, "--replicas=0", @@ -203,13 +203,14 @@ def set_exporter_collection_interval(self, interval_seconds: int) -> None: time.sleep(5) def restart_exporter_container( - self, container_name: str = "lightspeed-to-dataverse-exporter" + self, client, container_name: str = "lightspeed-to-dataverse-exporter" ) -> None: """Restart the exporter by scaling deployment back up. The deployment controller will create a new pod with the updated config. Args: + client: httpx Client instance for making API calls. container_name: Name of the exporter container (for verification). 
""" try: @@ -217,7 +218,7 @@ def restart_exporter_container( cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "-n", EXPORTER_NAMESPACE, "--replicas=1", @@ -249,7 +250,9 @@ def restart_exporter_container( # Wait for OLS API to be ready (not just pod running) print("Waiting for OLS API to be ready...") ols_url = cluster_utils.get_ols_url("ols") - if not wait_for_ols(ols_url, timeout=120, interval=5): + if not wait_for_ols( + ols_url, client=client, timeout=120, interval=5 + ): print("Warning: OLS readiness check timed out") else: print("OLS API is ready") @@ -310,6 +313,7 @@ def _verify_config_applied( def configure_exporter_for_e2e_tests( + client, interval_seconds: int = 3600, ingress_env: str = "stage", cp_offline_token: str | None = None, @@ -319,6 +323,7 @@ def configure_exporter_for_e2e_tests( """Configure exporter for e2e tests with proper settings. Args: + client: httpx Client instance for making API calls. interval_seconds: Collection interval (default: 3600 = 1 hour). ingress_env: Ingress environment - "stage" or "prod" (default: "stage"). cp_offline_token: Auth token for ingress server (required for stage). @@ -344,7 +349,7 @@ def configure_exporter_for_e2e_tests( ingress_server_auth_token=cp_offline_token or None, log_level=log_level, ) - controller.restart_exporter_container() + controller.restart_exporter_container(client) def patch_exporter_mode_to_manual() -> None: @@ -379,7 +384,7 @@ def patch_exporter_mode_to_manual() -> None: cluster_utils.run_oc( [ "patch", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "-n", EXPORTER_NAMESPACE, "--type=json", @@ -391,6 +396,7 @@ def patch_exporter_mode_to_manual() -> None: def prepare_for_data_collection_test( + client, short_interval_seconds: int = 5, ) -> DataCollectorControl: """Prepare the environment for testing data collection. 
@@ -401,6 +407,7 @@ def prepare_for_data_collection_test( - No cleanup needed (operator will reconcile when it runs next) Args: + client: httpx Client instance for making API calls. short_interval_seconds: Collection interval for testing (default: 5s). Returns: @@ -429,7 +436,7 @@ def prepare_for_data_collection_test( cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "-n", EXPORTER_NAMESPACE, "--replicas=0", @@ -468,7 +475,7 @@ def prepare_for_data_collection_test( patch_exporter_mode_to_manual() # Scale up and wait for pod - controller.restart_exporter_container() + controller.restart_exporter_container(client) # Wait for first collection cycle wait_time = short_interval_seconds + 3 diff --git a/tests/e2e/utils/metrics.py b/tests/e2e/utils/metrics.py index 90c65dab9..fb2968343 100644 --- a/tests/e2e/utils/metrics.py +++ b/tests/e2e/utils/metrics.py @@ -96,7 +96,6 @@ def get_enabled_model_and_provider(client): """Read configured model and provider from metrics.""" response = read_metrics(client) lines = [line.strip() for line in response.split("\n")] - labels = get_metric_labels(lines, "ols_provider_model_configuration", "1.0") return labels["model"], labels["provider"] diff --git a/tests/e2e/utils/ols_installer.py b/tests/e2e/utils/ols_installer.py index de1638b4e..29fc0b6ec 100644 --- a/tests/e2e/utils/ols_installer.py +++ b/tests/e2e/utils/ols_installer.py @@ -1,15 +1,17 @@ """Functions to install the service onto an OCP cluster using the OLS operator.""" +import json import os import subprocess import yaml from ols.constants import DEFAULT_CONFIGURATION_FILE +from tests.e2e.utils import client as client_utils from tests.e2e.utils import cluster as cluster_utils +from tests.e2e.utils.constants import OLS_SERVICE_DEPLOYMENT, LCORE_ENABLED from tests.e2e.utils.data_collector_control import configure_exporter_for_e2e_tests from tests.e2e.utils.retry import retry_until_timeout_or_success -from 
tests.e2e.utils.wait_for_ols import wait_for_ols OC_COMMAND_RETRY_COUNT = 120 OC_COMMAND_RETRY_DELAY = 5 @@ -17,31 +19,38 @@ disconnected = os.getenv("DISCONNECTED", "") -def create_and_config_sas() -> tuple[str, str]: - """Create and provide access to service accounts for testing. +def setup_service_accounts(namespace: str) -> None: + """Set up service accounts and access roles. - Returns: - tuple containing token and metrics token. + Args: + namespace: The Kubernetes namespace to create service accounts in. """ + print("Ensuring 'test-user' service account exists...") cluster_utils.run_oc( - ["project", "openshift-lightspeed"], ignore_existing_resource=True + ["create", "sa", "test-user", "-n", namespace], + ignore_existing_resource=True, ) - cluster_utils.create_user("test-user", ignore_existing_resource=True) - cluster_utils.create_user("metrics-test-user", ignore_existing_resource=True) - token = cluster_utils.get_token_for("test-user") - metrics_token = cluster_utils.get_token_for("metrics-test-user") - print("created test service account users") - # grant the test service accounts permission to query ols and retrieve metrics + print("Ensuring 'metrics-test-user' service account exists...") + cluster_utils.run_oc( + ["create", "sa", "metrics-test-user", "-n", namespace], + ignore_existing_resource=True, + ) + + print("Granting access roles to service accounts...") cluster_utils.grant_sa_user_access("test-user", "lightspeed-operator-query-access") cluster_utils.grant_sa_user_access( "metrics-test-user", "lightspeed-operator-ols-metrics-reader" ) - print("test service account permissions granted") - # grant pod listing permission to test-user - to test the tools, - # more specifically the we need the test-user be able to see pods - # in the namespace + +def setup_rbac(namespace: str) -> None: + """Set up pod-reader role and binding. + + Args: + namespace: The Kubernetes namespace for RBAC configuration. 
+ """ + print("Ensuring 'pod-reader' role and rolebinding exist...") cluster_utils.run_oc( [ "create", @@ -49,7 +58,8 @@ def create_and_config_sas() -> tuple[str, str]: "pod-reader", "--verb=get,list", "--resource=pods", - "--namespace=openshift-lightspeed", + "--namespace", + namespace, ], ignore_existing_resource=True, ) @@ -60,17 +70,109 @@ def create_and_config_sas() -> tuple[str, str]: "rolebinding", "test-user-pod-reader", "--role=pod-reader", - "--serviceaccount=openshift-lightspeed:test-user", - "--namespace=openshift-lightspeed", + f"--serviceaccount={namespace}:test-user", + "--namespace", + namespace, ], ignore_existing_resource=True, ) + print("RBAC setup verified.") - print("Granted test-user permission to list pods.") +def get_service_account_tokens() -> tuple[str, str]: + """Get tokens for test service accounts. + + Returns: + tuple containing token and metrics token. + """ + print("Fetching tokens for service accounts...") + token = cluster_utils.get_token_for("test-user") + metrics_token = cluster_utils.get_token_for("metrics-test-user") return token, metrics_token +def update_lcore_setting() -> None: + """Update the --use-lcore argument in the CSV if LCORE is enabled. + + Checks if LCORE environment variable is enabled and ensures the + --use-lcore argument in the ClusterServiceVersion is set to true. 
+ """ + print("LCORE enabled, checking CSV configuration...") + namespace = "openshift-lightspeed" + + # Get the CSV name + csv_name_result = cluster_utils.run_oc( + ["get", "csv", "-n", namespace, "-o", "name"] + ) + csv_full_name = csv_name_result.stdout.strip() + if not csv_full_name: + print("No CSV found in namespace, skipping LCORE update") + return + + csv_name = csv_full_name.replace("clusterserviceversion.operators.coreos.com/", "") + + # Get current args from the CSV + args_result = cluster_utils.run_oc( + [ + "get", + "csv", + csv_name, + "-n", + namespace, + "-o", + "json", + ] + ) + csv_data = json.loads(args_result.stdout) + args = csv_data["spec"]["install"]["spec"]["deployments"][0]["spec"]["template"][ + "spec" + ]["containers"][0]["args"] + + # Check if --use-lcore exists and its value + lcore_arg_index = None + lcore_value = None + for i, arg in enumerate(args): + if arg.startswith("--use-lcore="): + lcore_arg_index = i + lcore_value = arg.split("=", 1)[1] + break + + if lcore_arg_index is None: + print("--use-lcore argument not found in CSV") + return + + if lcore_value == "true": + print("--use-lcore already set to true, no update needed") + return + + print(f"--use-lcore is set to {lcore_value}, updating to true...") + + # Update the argument + patch = ( + f'[{{"op": "replace", "path": "/spec/install/spec/deployments/0/spec/' + f'template/spec/containers/0/args/{lcore_arg_index}", ' + f'"value": "--use-lcore=true"}}]' + ) + + cluster_utils.run_oc( + [ + "patch", + "csv", + csv_name, + "-n", + namespace, + "--type", + "json", + "-p", + patch, + ] + ) + cluster_utils.wait_for_running_pod( + name="lightspeed-operator-controller-manager", namespace="openshift-lightspeed" + ) + print("--use-lcore updated to true successfully") + + def update_ols_config() -> None: """Create the ols config configmap with log and collector config for e2e tests. 
@@ -116,10 +218,33 @@ def update_ols_config() -> None: configmap["data"][DEFAULT_CONFIGURATION_FILE] = yaml.dump(olsconfig) updated_configmap = yaml.dump(configmap) - cluster_utils.run_oc(["delete", "configmap", "olsconfig"]) cluster_utils.run_oc(["apply", "-f", "-"], command=updated_configmap) +def setup_route() -> str: + """Set up route and return OLS URL. + + Returns: + The HTTPS URL for accessing the OLS service. + """ + try: + cluster_utils.run_oc(["delete", "route", "ols"], ignore_existing_resource=False) + except Exception: + print("No existing route to delete. Continuing...") + + print("Creating route for OLS access") + cluster_utils.run_oc( + ["create", "-f", "tests/config/operator_install/route.yaml"], + ignore_existing_resource=False, + ) + + url = cluster_utils.run_oc( + ["get", "route", "ols", "-o", "jsonpath='{.spec.host}'"] + ).stdout.strip("'") + + return f"https://{url}" + + def replace_ols_image(ols_image: str) -> None: """Replace the existing ols image with a new one. 
@@ -146,7 +271,7 @@ def replace_ols_image(ols_image: str) -> None: cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "0", ] @@ -163,7 +288,7 @@ def replace_ols_image(ols_image: str) -> None: # update the OLS deployment to use the new image from CI/OLS_IMAGE env var patch = f"""[{{"op": "replace", "path": "/spec/template/spec/containers/0/image", "value":"{ols_image}"}}]""" # noqa: E501 cluster_utils.run_oc( - ["patch", "deployment/lightspeed-app-server", "--type", "json", "-p", patch] + ["patch", f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--type", "json", "-p", patch] ) @@ -283,8 +408,10 @@ def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915, R0912 # no cluster_utils.run_oc( ["project", "openshift-lightspeed"], ignore_existing_resource=True ) - token, metrics_token = create_and_config_sas() - + namespace = "openshift-lightspeed" + setup_service_accounts(namespace) + setup_rbac(namespace) + token, metrics_token = get_service_account_tokens() # wait for the operator to install # time.sleep(3) # not sure if it is needed but it fails sometimes r = retry_until_timeout_or_success( @@ -309,6 +436,8 @@ def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915, R0912 # no provider = os.getenv("PROVIDER", "openai") creds = os.getenv("PROVIDER_KEY_PATH", "empty") + if LCORE_ENABLED: + update_lcore_setting() # create the llm api key secret ols will mount provider_list = provider.split() creds_list = creds.split() @@ -383,13 +512,13 @@ def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915, R0912 # no [ "get", "deployment", - "lightspeed-app-server", + f"{OLS_SERVICE_DEPLOYMENT}", "--ignore-not-found", "-o", "name", ] ).stdout - == "deployment.apps/lightspeed-app-server\n", + == f"deployment.apps/{OLS_SERVICE_DEPLOYMENT}\n", "Waiting for OLS API server deployment to be created", ) if not r: @@ -428,7 +557,7 @@ def install_ols() -> tuple[str, str, str]: # pylint: 
disable=R0915, R0912 # no cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "0", ] @@ -438,14 +567,14 @@ def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915, R0912 # no cluster_utils.run_oc( [ "scale", - "deployment/lightspeed-app-server", + f"deployment/{OLS_SERVICE_DEPLOYMENT}", "--replicas", "1", ] ) print("Deployment updated, waiting for new pod to be ready") # Wait for the pod to start being created and then wait for it to start running. - cluster_utils.wait_for_running_pod() + cluster_utils.wait_for_running_pod(name=OLS_SERVICE_DEPLOYMENT) print("-" * 50) print("OLS pod seems to be ready") @@ -465,15 +594,22 @@ def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915, R0912 # no # expect it to be (must-gather will also collect this) print( cluster_utils.run_oc( - ["get", "deployment", "lightspeed-app-server", "-o", "yaml"] + ["get", "deployment", OLS_SERVICE_DEPLOYMENT, "-o", "yaml"] ).stdout ) print("-" * 50) + + # Set up route and get URL first + ols_url = setup_route() + if not disconnected: # Configure exporter for e2e tests with proper settings try: print("Configuring exporter for e2e tests...") + # Create client for the exporter configuration + test_client = client_utils.get_http_client(ols_url, token) configure_exporter_for_e2e_tests( + client=test_client, interval_seconds=3600, # 1 hour to prevent interference ingress_env="stage", log_level="debug", @@ -484,22 +620,4 @@ def install_ols() -> tuple[str, str, str]: # pylint: disable=R0915, R0912 # no print(f"Warning: Could not configure exporter: {e}") print("Tests may experience interference from data collector") - try: - cluster_utils.run_oc( - [ - "delete", - "route", - "ols", - ], - ) - except subprocess.CalledProcessError: - print("No route exists, creating it.") - # create a route so tests can access OLS directly - cluster_utils.run_oc(["create", "-f", "tests/config/operator_install/route.yaml"]) - 
- url = cluster_utils.run_oc( - ["get", "route", "ols", "-o", "jsonpath='{.spec.host}'"] - ).stdout.strip("'") - ols_url = f"https://{url}" - wait_for_ols(ols_url) return ols_url, token, metrics_token diff --git a/tests/e2e/utils/wait_for_ols.py b/tests/e2e/utils/wait_for_ols.py index d1547781c..9518ea6c0 100644 --- a/tests/e2e/utils/wait_for_ols.py +++ b/tests/e2e/utils/wait_for_ols.py @@ -10,15 +10,19 @@ from requests.exceptions import SSLError from urllib3.exceptions import InsecureRequestWarning +from tests.e2e.utils.constants import ( + BASIC_ENDPOINTS_TIMEOUT, +) + warnings.filterwarnings("ignore", category=InsecureRequestWarning) -# ruff: noqa: S501 -def wait_for_ols(url, timeout=300, interval=10): +def wait_for_ols(url, client, timeout=300, interval=10): """Wait for the OLS to become ready by checking its readiness endpoint. Args: url (str): The base URL of the OLS service. + client (Client): httpx client with configured headers timeout (int, optional): The maximum time to wait in seconds. Default is 600. interval (int, optional): The interval between readiness checks in seconds. Default is 10. 
@@ -30,14 +34,14 @@ def wait_for_ols(url, timeout=300, interval=10): for attempt in range(1, attempts + 1): print(f"Checking OLS readiness, attempt {attempt} of {attempts}") try: - response = requests.get(f"{url}/readiness", verify=True, timeout=5) + response = client.get("/readiness", timeout=BASIC_ENDPOINTS_TIMEOUT) if response.status_code == requests.codes.ok: print("OLS is ready") return True except SSLError: print("SSL error detected, retrying without SSL verification") try: - response = requests.get(f"{url}/readiness", verify=False, timeout=5) + response = client.get("/readiness", timeout=BASIC_ENDPOINTS_TIMEOUT) if response.status_code == requests.codes.ok: print("OLS is ready") return True diff --git a/tests/scripts/test-e2e-cluster.sh b/tests/scripts/test-e2e-cluster.sh index ca1d2474d..307a7b8c1 100755 --- a/tests/scripts/test-e2e-cluster.sh +++ b/tests/scripts/test-e2e-cluster.sh @@ -35,7 +35,7 @@ function run_suites() { # If changes are done in this file, please make sure they reflect in test-e2e-cluster-periodics.sh and test-evaluation.sh # runsuite arguments: - # suiteid test_tags provider provider_keypath model ols_image os_config_suffix + # suiteid test_tags provider provider_keypath model ols_image ols_config_suffix # empty test_tags means run all tests run_suite "azure_openai" "not certificates and not (tool_calling and not smoketest and not rag) and not byok1 and not byok2 and not quota_limits and not data_export" "azure_openai" "$AZUREOPENAI_PROVIDER_KEY_PATH" "gpt-4o-mini" "$OLS_IMAGE" "default" (( rc = rc || $? ))