[CM-10024] introduce tests against real backend #132

Merged 4 commits on May 15, 2024
54 changes: 54 additions & 0 deletions .github/workflows/e2e-tests.yml
@@ -0,0 +1,54 @@
name: E2E Tests Comet LLM
env:
  COMET_RAISE_EXCEPTIONS_ON_ERROR: "1"
  COMET_API_KEY: ${{ secrets.PRODUCTION_CI_COMET_API_KEY }}
on:
  pull_request:

jobs:
  UnitTests:
    name: E2E_Python_${{matrix.python_version}}
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]

    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Set the project name
        run: |
          echo "COMET_PROJECT_NAME=comet-llm-e2e-tests-py${{ matrix.python_version }}" >> $GITHUB_ENV

      - name: Print environment variables
        run: env

      - name: Print event object
        run: cat $GITHUB_EVENT_PATH

      - name: Print the PR title
        run: echo "${{ github.event.pull_request.title }}"

      - name: Setup Python ${{ matrix.python_version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}

      - name: Install comet-llm
        run: pip install -e .

      - name: Install test requirements
        run: |
          cd ./tests
          pip install --no-cache-dir --disable-pip-version-check -r test_requirements.txt

      - name: Running SDK e2e Tests
        run: python -m pytest --cov=src/comet_llm --cov-report=html:coverage_report_${{matrix.python_version}} -vv tests/e2e/

      - name: archive coverage report
        uses: actions/upload-artifact@v3
        with:
          name: coverage_report_${{matrix.python_version}}
          path: coverage_report_${{matrix.python_version}}
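
For readers who want to reproduce this job locally, the sketch below sets the same environment variables the workflow defines and then runs the same pytest target. It is only an illustration under assumptions: the API key value is a placeholder, the project name is arbitrary, and it assumes it is run from the repository root with comet-llm and the test requirements already installed.

import os
import subprocess

# Mirror the workflow's env section (the API key below is a placeholder, not a real secret).
os.environ["COMET_RAISE_EXCEPTIONS_ON_ERROR"] = "1"
os.environ["COMET_API_KEY"] = "<your-comet-api-key>"
os.environ["COMET_PROJECT_NAME"] = "comet-llm-e2e-tests-local"

# Run the same pytest invocation as the "Running SDK e2e Tests" step,
# without the coverage flags for brevity.
subprocess.run(["python", "-m", "pytest", "-vv", "tests/e2e/"], check=True)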
Empty file added tests/e2e/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions tests/e2e/conftest.py
@@ -0,0 +1,9 @@
import comet_ml

import pytest


@pytest.fixture(scope="session")
def comet_api():
    api = comet_ml.API(cache=False)
    return api
47 changes: 47 additions & 0 deletions tests/e2e/test_chains.py
@@ -0,0 +1,47 @@
import logging
from typing import TYPE_CHECKING

import comet_llm

from . import verifier

if TYPE_CHECKING:
    import comet_ml

LOGGER = logging.getLogger(__name__)


def test_start_and_end_chain__happyflow(comet_api: "comet_ml.API"):
    # Chain and span inputs/outputs are not verified for now.

    comet_llm.start_chain(
        inputs="chain-inputs",
        tags=["tag1", "tag2"],
        metadata={"start-metadata-key": "start-metadata-value"}
    )

    with comet_llm.Span(category="grand-parent", inputs="grand-parent-span-input") as grandparent_span:
        with comet_llm.Span(category="parent", inputs="parent-span-input") as parent_span:
            with comet_llm.Span(category="llm-call", inputs="llm-call-input") as llm_call_span:
                llm_call_span.set_outputs({"llm-call-output-key": "llm-call-output-value"})
            parent_span.set_outputs({"parent-output-key": "parent-output-value"})
        grandparent_span.set_outputs({"grandparent-output-key": "grandparent-output-value"})

    llm_result = comet_llm.end_chain(
        outputs="chain-outputs",
        metadata={"end-metadata-key": "end-metadata-value"}
    )

    print("test_start_and_end_chain__happyflow trace ID: %s" % llm_result.id)

    verifier.verify_trace(
        comet_api,
        llm_result.id,
        expected_tags=["tag1", "tag2"],
        expected_metadata={
            "start-metadata-key": "start-metadata-value",
            "end-metadata-key": "end-metadata-value",
        }
    )


38 changes: 38 additions & 0 deletions tests/e2e/test_prompts.py
@@ -0,0 +1,38 @@
import logging
from typing import TYPE_CHECKING

import comet_llm

from . import verifier

if TYPE_CHECKING:
    import comet_ml

LOGGER = logging.getLogger(__name__)

def test_log_prompt__happyflow(comet_api: "comet_ml.API"):
    # prompt and output are not verified for now

    llm_result = comet_llm.log_prompt(
        prompt="the-input",
        output="the-output",
        duration=42,
        tags=["tag1", "tag2"],
        metadata={
            "metadata-key-1": "metadata-value-1",
            "metadata-key-2": 123,
        }
    )

    print("test_log_prompt__happyflow trace ID: %s" % llm_result.id)

    verifier.verify_trace(
        comet_api,
        llm_result.id,
        expected_duration=42,
        expected_tags=["tag1", "tag2"],
        expected_metadata={
            "metadata-key-1": "metadata-value-1",
            "metadata-key-2": 123,
        }
    )
75 changes: 75 additions & 0 deletions tests/e2e/verifier.py
@@ -0,0 +1,75 @@
from typing import Any, Dict, List, Optional

import comet_ml

from .. import testlib


def verify_trace(
    comet_api: "comet_ml.API",
    trace_id: str,
    expected_duration: Optional[float] = None,
    expected_tags: Optional[List[str]] = None,
    expected_metadata: Optional[Dict[str, Any]] = None,
):
"""
Performs assertions for various trace (prompt | chain) attributes.
As of today it can check the fact that:
- Trace was saved on the backend side (as experiment)
- It contains comet_llm_data.json asset
- Expected duration, tags, metadata are the same as the actual ones.

The function takes into account that some data might not be avalable
right after logging, so it can wait for some pieces of data (except for the check
for trace and asset existence).

TODO: probably add assertions for asset content. E.g. today trace input and output
are not verified, however, they are
"""
    api_experiment: "comet_ml.APIExperiment" = comet_api.get_experiment_by_id(experiment=trace_id)
    assert api_experiment is not None, "Failed to verify that trace was saved"

    assets = api_experiment.get_asset_list()
    assert len(assets) == 1, "Failed to verify that trace contains asset"
    assert assets[0]["fileName"] == "comet_llm_data.json"

    if expected_duration is not None:
        assert testlib.until(
            function=lambda: len(api_experiment.get_metrics(metric="chain_duration")) != 0
        ), "Failed to get duration (a.k.a. chain_duration metric)"
        metrics = api_experiment.get_metrics(metric="chain_duration")
        _assert_equal_with_conversion_to_left_type(
            expected_duration,
            metrics[0]["metricValue"]
        )

    if expected_tags is not None:
        assert testlib.until(
            function=lambda: len(api_experiment.get_tags()) != 0
        ), "Failed to get tags"
        actual_tags = api_experiment.get_tags()
        assert actual_tags == expected_tags

    if expected_metadata is not None:
        assert testlib.until(
            function=lambda: len(api_experiment.get_parameters_summary()) != 0
        ), "Failed to get trace metadata (a.k.a. parameters)"
        actual_parameters = api_experiment.get_parameters_summary()
        assert len(actual_parameters) == len(expected_metadata)
        for actual_parameter in actual_parameters:
            name = actual_parameter["name"]
            _assert_equal_with_conversion_to_left_type(
                expected_metadata[name],
                actual_parameter["valueCurrent"]
            )


def _assert_equal_with_conversion_to_left_type(left_value: Any, right_value: Any) -> None:
    """
    Convenience assertion for comparing expected values with the string
    data returned from the backend: the right value is converted to the
    left value's type before comparison.
    """
    left_type = type(left_value)
    right_value_converted = left_type(right_value)

    assert left_value == right_value_converted
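
To illustrate why the conversion helper above exists, here is a hypothetical, standalone sketch (not part of the PR): metric and parameter values come back from the backend as strings, so a direct comparison with the originally logged Python values would fail.

expected_duration = 42   # logged as a Python int
backend_value = "42"     # the same metric as returned by the backend API

# A naive equality check fails because of the type mismatch.
assert expected_duration != backend_value

# Converting the backend string to the expected value's type, as
# _assert_equal_with_conversion_to_left_type does, makes the comparison meaningful.
assert expected_duration == type(expected_duration)(backend_value)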
19 changes: 17 additions & 2 deletions tests/testlib.py
@@ -1,9 +1,12 @@
import contextlib
import logging
import os
import time
from typing import Callable, Dict


@contextlib.contextmanager
def environ(env: Dict[str, str]):
    """Temporarily set environment variables inside the context manager and
    fully restore previous environment afterwards
    """
@@ -17,4 +20,16 @@ def environ(env):
            if value is None:
                del os.environ[key]
            else:
                os.environ[key] = value


def until(function: Callable, sleep: float = 0.5, max_try_seconds: int = 20) -> bool:
    """
    Repeatedly call function() until it returns a truthy value,
    sleeping between attempts. Gives up after max_try_seconds.
    """
    start_time = time.time()
    while not function():
        if (time.time() - start_time) > max_try_seconds:
            return False
        time.sleep(sleep)
    return True
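
A small usage sketch of until (hypothetical values, assuming the function above is in scope): poll a condition with a custom interval and timeout, the way verifier.py does for data that is not immediately available on the backend.

import time

start = time.time()

# Wait up to 5 seconds, checking every 0.1 seconds, for a condition that
# becomes true after roughly one second.
succeeded = until(lambda: time.time() - start > 1.0, sleep=0.1, max_try_seconds=5)
assert succeeded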