Add Evaluation GH Action #209

Merged 1 commit on Jun 25, 2024
89 changes: 89 additions & 0 deletions .github/workflows/evaluation.yml
@@ -0,0 +1,89 @@
name: Evaluation Matrix
on:
  push:
    branches:
      - main

permissions:
  deployments: write
  contents: write

jobs:
  evaluation:
    name: Performance Evaluation
    runs-on: ubuntu-latest
    strategy:
      matrix:
        evaluation:
          - provider: ChatIBMGenAI
            model_prefix: codellama
            model: codellama-34b-instruct
          - provider: ChatIBMGenAI
            model_prefix: deepseek-ai
            model: deepseek-coder-33b-instruct
          - provider: ChatIBMGenAI
            model_prefix: meta-llama
            model: llama-3-70b-instruct
            max_new_tokens: 2048
          - provider: ChatIBMGenAI
            model_prefix: mistralai
            model: mistral-7b-v0-1
          - provider: ChatIBMGenAI
            model_prefix: mistralai
            model: mixtral-8x7b-instruct-v01
        test:
          - example: example_a
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@main
        with:
          python-version: 3.12.3
      - name: Update environment
        run: |
          echo "MAX_NEW_TOKENS=${{ matrix.evaluation.max_new_tokens }}" >> $GITHUB_ENV
        if: matrix.evaluation.max_new_tokens != null
      - name: Run benchmark
        run: |
          pip install -r requirements.txt
          pip install -e .
          cd kai
          cat << EOF > config.toml
          log_level = "info"
          demo_mode = false
          [incident_store]
          provider = "postgresql"
          [incident_store.args]
          host = "127.0.0.1"
          database = "kai"
          user = "kai"
          password = "dog8code"
          [embeddings]
          todo = true
          [models]
          provider = "${{ matrix.evaluation.provider }}"
          [models.args]
          model_id = "${{ matrix.evaluation.model_prefix }}/${{ matrix.evaluation.model }}"
          EOF
          if [[ ! -z "${MAX_NEW_TOKENS}" ]]; then
            cat << EOF >> config.toml
          parameters.max_new_tokens = ${{ matrix.evaluation.max_new_tokens }}
          EOF
          fi
          echo [{\"name\": \
            \"${{ matrix.evaluation.provider }}_${{ matrix.evaluation.model_prefix }}_${{ matrix.evaluation.model }}_${{ matrix.test.example }}\", \
            \"unit\": \"Match\", \
            \"value\": \"$(python evaluation.py --configs ./config.toml | tail -n 1 | awk '{ print $3 }')\" \
            }] > ../output.txt
          git checkout config.toml
          cd ..
        env:
          GENAI_KEY: ${{ secrets.GENAI_KEY }}
      - name: Store benchmark result
        uses: benchmark-action/github-action-benchmark@v1
        with:
          tool: customBiggerIsBetter
          benchmark-data-dir-path: evaluations
          output-file-path: output.txt
          #fail-on-alert: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          auto-push: true
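For reference, the "Run benchmark" step writes output.txt as a single-element JSON array with "name", "unit", and "value" fields, which is the shape benchmark-action/github-action-benchmark reads when tool is set to customBiggerIsBetter. The sketch below is a minimal Python equivalent of that echo pipeline for running outside the workflow; the provider, model, and example names are illustrative placeholders standing in for the matrix values, and it assumes evaluation.py prints the score as the third whitespace-separated field of its last output line, as the awk invocation above does.

import json
import subprocess

# Illustrative values; in the workflow these come from the matrix entries.
provider = "ChatIBMGenAI"
model_prefix = "mistralai"
model = "mixtral-8x7b-instruct-v01"
example = "example_a"

# Mirror `python evaluation.py --configs ./config.toml | tail -n 1 | awk '{ print $3 }'`.
result = subprocess.run(
    ["python", "evaluation.py", "--configs", "./config.toml"],
    capture_output=True, text=True, check=True,
)
last_line = result.stdout.strip().splitlines()[-1]
score = last_line.split()[2]

# One entry per run, in the customBiggerIsBetter schema.
entry = [{
    "name": f"{provider}_{model_prefix}_{model}_{example}",
    "unit": "Match",
    "value": score,
}]

with open("../output.txt", "w") as f:
    json.dump(entry, f)

The benchmark action then appends each such entry to the data it keeps under the evaluations directory (benchmark-data-dir-path) and pushes the update because auto-push is enabled.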