[CM-10024] introduce tests against real backend #132

Merged 4 commits on May 15, 2024
54 changes: 54 additions & 0 deletions .github/workflows/e2e-tests.yml
@@ -0,0 +1,54 @@
name: E2E Tests Comet LLM
env:
  COMET_RAISE_EXCEPTIONS_ON_ERROR: "1"
  COMET_API_KEY: ${{ secrets.PRODUCTION_CI_COMET_API_KEY }}
on:
  pull_request:

jobs:
  UnitTests:
    name: E2E_Python_${{matrix.python_version}}
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]

    steps:
      - name: Check out code
        uses: actions/checkout@v3

      - name: Set the project name
        run: |
          echo "COMET_PROJECT_NAME=comet-llm-e2e-tests-py${{ matrix.python_version }}" >> $GITHUB_ENV

      - name: Print environment variables
        run: env

      - name: Print event object
        run: cat $GITHUB_EVENT_PATH

      - name: Print the PR title
        run: echo "${{ github.event.pull_request.title }}"

      - name: Setup Python ${{ matrix.python_version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}

      - name: Install comet-llm
        run: pip install -e .

      - name: Install test requirements
        run: |
          cd ./tests
          pip install --no-cache-dir --disable-pip-version-check -r test_requirements.txt

      - name: Running SDK e2e Tests
        run: python -m pytest --cov=src/comet_llm --cov-report=html:coverage_report_${{matrix.python_version}} -vv tests/e2e/

      - name: archive coverage report
        uses: actions/upload-artifact@v3
        with:
          name: coverage_report_${{matrix.python_version}}
          path: coverage_report_${{matrix.python_version}}
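
For readers who want to reproduce this job locally, the sketch below sets the same environment variables the workflow defines and then runs the same pytest target. It is only an illustration under assumptions: the API key value is a placeholder, the project name is arbitrary, and it assumes it is run from the repository root with comet-llm and the test requirements already installed.

import os
import subprocess

# Mirror the workflow's env section (the API key below is a placeholder, not a real secret).
os.environ["COMET_RAISE_EXCEPTIONS_ON_ERROR"] = "1"
os.environ["COMET_API_KEY"] = "<your-comet-api-key>"
os.environ["COMET_PROJECT_NAME"] = "comet-llm-e2e-tests-local"

# Run the same pytest invocation as the "Running SDK e2e Tests" step,
# without the coverage flags for brevity.
subprocess.run(["python", "-m", "pytest", "-vv", "tests/e2e/"], check=True)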
Empty file added tests/e2e/__init__.py
Empty file.
9 changes: 9 additions & 0 deletions tests/e2e/conftest.py
@@ -0,0 +1,9 @@
import comet_ml

import pytest


@pytest.fixture(scope="session")
def comet_api():
    api = comet_ml.API(cache=False)
    return api
47 changes: 47 additions & 0 deletions tests/e2e/test_chains.py
@@ -0,0 +1,47 @@
import logging
from typing import TYPE_CHECKING

import comet_llm

from . import verifier

if TYPE_CHECKING:
    import comet_ml

LOGGER = logging.getLogger(__name__)


def test_start_and_end_chain__happyflow(comet_api: "comet_ml.API"):
    # Chain and span inputs/outputs are not verified for now.

    comet_llm.start_chain(
        inputs="chain-inputs",
        tags=["tag1", "tag2"],
        metadata={"start-metadata-key": "start-metadata-value"}
    )

    with comet_llm.Span(category="grand-parent", inputs="grand-parent-span-input") as grandparent_span:
        with comet_llm.Span(category="parent", inputs="parent-span-input") as parent_span:
            with comet_llm.Span(category="llm-call", inputs="llm-call-input") as llm_call_span:
                llm_call_span.set_outputs({"llm-call-output-key": "llm-call-output-value"})
            parent_span.set_outputs({"parent-output-key": "parent-output-value"})
        grandparent_span.set_outputs({"grandparent-output-key": "grandparent-output-value"})

    llm_result = comet_llm.end_chain(
        outputs="chain-outputs",
        metadata={"end-metadata-key": "end-metadata-value"}
    )

    print("test_start_and_end_chain__happyflow trace ID: %s" % llm_result.id)

    verifier.verify_trace(
        comet_api,
        llm_result.id,
        expected_tags=["tag1", "tag2"],
        expected_metadata={
            "start-metadata-key": "start-metadata-value",
            "end-metadata-key": "end-metadata-value",
        }
    )


38 changes: 38 additions & 0 deletions tests/e2e/test_prompts.py
@@ -0,0 +1,38 @@
import logging
from typing import TYPE_CHECKING

import comet_llm

from . import verifier

if TYPE_CHECKING:
    import comet_ml

LOGGER = logging.getLogger(__name__)

def test_log_prompt__happyflow(comet_api: "comet_ml.API"):
    # prompt and output are not verified for now

    llm_result = comet_llm.log_prompt(
        prompt="the-input",
        output="the-output",
        duration=42,
        tags=["tag1", "tag2"],
        metadata={
            "metadata-key-1": "metadata-value-1",
            "metadata-key-2": 123,
        }
    )

    print("test_log_prompt__happyflow trace ID: %s" % llm_result.id)

    verifier.verify_trace(
        comet_api,
        llm_result.id,
        expected_duration=42,
        expected_tags=["tag1", "tag2"],
        expected_metadata={
            "metadata-key-1": "metadata-value-1",
            "metadata-key-2": 123,
        }
    )
75 changes: 75 additions & 0 deletions tests/e2e/verifier.py
@@ -0,0 +1,75 @@
from typing import Any, Dict, List, Optional

import comet_ml

from .. import testlib


def verify_trace(
    comet_api: "comet_ml.API",
    trace_id: str,
    expected_duration: Optional[float] = None,
    expected_tags: Optional[List[str]] = None,
    expected_metadata: Optional[Dict[str, Any]] = None,
):
"""
Performs assertions for various trace (prompt | chain) attributes.
As of today it can check the fact that:
- Trace was saved on the backend side (as experiment)
- It contains comet_llm_data.json asset
- Expected duration, tags, metadata are the same as the actual ones.

The function takes into account that some data might not be avalable
right after logging, so it can wait for some pieces of data (except for the check
for trace and asset existence).

TODO: probably add assertions for asset content. E.g. today trace input and output
are not verified, however, they are
"""
    api_experiment: "comet_ml.APIExperiment" = comet_api.get_experiment_by_id(experiment=trace_id)
    assert api_experiment is not None, "Failed to verify that trace was saved"

    assets = api_experiment.get_asset_list()
    assert len(assets) == 1, "Failed to verify that trace contains asset"
    assert assets[0]["fileName"] == "comet_llm_data.json"

    if expected_duration is not None:
        assert testlib.until(
            function=lambda: len(api_experiment.get_metrics(metric="chain_duration")) != 0
        ), "Failed to get duration (a.k.a. chain_duration metric)"
        metrics = api_experiment.get_metrics(metric="chain_duration")
        _assert_equal_with_conversion_to_left_type(
            expected_duration,
            metrics[0]["metricValue"]
        )

    if expected_tags is not None:
        assert testlib.until(
            function=lambda: len(api_experiment.get_tags()) != 0
        ), "Failed to get tags"
        actual_tags = api_experiment.get_tags()
        assert actual_tags == expected_tags

    if expected_metadata is not None:
        assert testlib.until(
            function=lambda: len(api_experiment.get_parameters_summary()) != 0
        ), "Failed to get trace metadata (a.k.a. parameters)"
        actual_parameters = api_experiment.get_parameters_summary()
        assert len(actual_parameters) == len(expected_metadata)
        for actual_parameter in actual_parameters:
            name = actual_parameter["name"]
            _assert_equal_with_conversion_to_left_type(
                expected_metadata[name],
                actual_parameter["valueCurrent"]
            )


def _assert_equal_with_conversion_to_left_type(left_value: Any, right_value: Any) -> None:
    """
    Convenience assertion for comparing expected values with the string
    data returned from the backend: the right value is converted to the
    left value's type before comparison.
    """
    left_type = type(left_value)
    right_value_converted = left_type(right_value)

    assert left_value == right_value_converted
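
To illustrate why the conversion helper above exists, here is a hypothetical, standalone sketch (not part of the PR): metric and parameter values come back from the backend as strings, so a direct comparison with the originally logged Python values would fail.

expected_duration = 42   # logged as a Python int
backend_value = "42"     # the same metric as returned by the backend API

# A naive equality check fails because of the type mismatch.
assert expected_duration != backend_value

# Converting the backend string to the expected value's type, as
# _assert_equal_with_conversion_to_left_type does, makes the comparison meaningful.
assert expected_duration == type(expected_duration)(backend_value)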
19 changes: 17 additions & 2 deletions tests/testlib.py
@@ -1,9 +1,12 @@
import contextlib
import logging
import os
import time
from typing import Callable, Dict


@contextlib.contextmanager
def environ(env: Dict[str, str]):
    """Temporarily set environment variables inside the context manager and
    fully restore previous environment afterwards
    """
@@ -17,4 +20,16 @@ def environ(env):
            if value is None:
                del os.environ[key]
            else:
                os.environ[key] = value


def until(function: Callable, sleep: float = 0.5, max_try_seconds: int = 20) -> bool:
    """
    Repeatedly call function() until it returns a truthy value,
    sleeping between attempts. Gives up after max_try_seconds.
    """
    start_time = time.time()
    while not function():
        if (time.time() - start_time) > max_try_seconds:
            return False
        time.sleep(sleep)
    return True
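
A small usage sketch of until (hypothetical values, assuming the function above is in scope): poll a condition with a custom interval and timeout, the way verifier.py does for data that is not immediately available on the backend.

import time

start = time.time()

# Wait up to 5 seconds, checking every 0.1 seconds, for a condition that
# becomes true after roughly one second.
succeeded = until(lambda: time.time() - start > 1.0, sleep=0.1, max_try_seconds=5)
assert succeeded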