From ebef75bbc982400bf7ddd5abd2c769c05164ed00 Mon Sep 17 00:00:00 2001
From: Jason Montleon
Date: Tue, 25 Jun 2024 11:23:42 -0400
Subject: [PATCH] Add Evaluation GH Action (#209)

Signed-off-by: Jason Montleon
---
 .github/workflows/evaluation.yml | 89 ++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 .github/workflows/evaluation.yml

diff --git a/.github/workflows/evaluation.yml b/.github/workflows/evaluation.yml
new file mode 100644
index 00000000..419d30d2
--- /dev/null
+++ b/.github/workflows/evaluation.yml
@@ -0,0 +1,89 @@
+name: Evaluation Matrix
+on:
+  push:
+    branches:
+      - main
+
+permissions:
+  deployments: write
+  contents: write
+
+jobs:
+  evaluation:
+    name: Performance Evaluation
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        evaluation:
+          - provider: ChatIBMGenAI
+            model_prefix: codellama
+            model: codellama-34b-instruct
+          - provider: ChatIBMGenAI
+            model_prefix: deepseek-ai
+            model: deepseek-coder-33b-instruct
+          - provider: ChatIBMGenAI
+            model_prefix: meta-llama
+            model: llama-3-70b-instruct
+            max_new_tokens: 2048
+          - provider: ChatIBMGenAI
+            model_prefix: mistralai
+            model: mistral-7b-v0-1
+          - provider: ChatIBMGenAI
+            model_prefix: mistralai
+            model: mixtral-8x7b-instruct-v01
+        test:
+          - example: example_a
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@main
+        with:
+          python-version: 3.12.3
+      - name: Update environment
+        run: |
+          echo "MAX_NEW_TOKENS=${{ matrix.evaluation.max_new_tokens }}" >> $GITHUB_ENV
+        if: matrix.evaluation.max_new_tokens != null
+      - name: Run benchmark
+        run: |
+          pip install -r requirements.txt
+          pip install -e .
+          cd kai
+          cat << EOF > config.toml
+          log_level = "info"
+          demo_mode = false
+          [incident_store]
+          provider = "postgresql"
+          [incident_store.args]
+          host = "127.0.0.1"
+          database = "kai"
+          user = "kai"
+          password = "dog8code"
+          [embeddings]
+          todo = true
+          [models]
+          provider = "${{ matrix.evaluation.provider }}"
+          [models.args]
+          model_id = "${{ matrix.evaluation.model_prefix }}/${{ matrix.evaluation.model }}"
+          EOF
+          if [[ ! -z "${MAX_NEW_TOKENS}" ]]; then
+          cat << EOF >> config.toml
+          parameters.max_new_tokens = ${{ matrix.evaluation.max_new_tokens }}
+          EOF
+          fi
+          echo [{\"name\": \
+          \"${{ matrix.evaluation.provider }}_${{ matrix.evaluation.model_prefix }}_${{ matrix.evaluation.model }}_${{ matrix.test.example }}\", \
+          \"unit\": \"Match\", \
+          \"value\": \"$(python evaluation.py --configs ./config.toml | tail -n 1 | awk '{ print $3 }')\" \
+          }] > ../output.txt
+          git checkout config.toml
+          cd ..
+        env:
+          GENAI_KEY: ${{ secrets.GENAI_KEY }}
+      - name: Store benchmark result
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          tool: customBiggerIsBetter
+          benchmark-data-dir-path: evaluations
+          output-file-path: output.txt
+          #fail-on-alert: true
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          auto-push: true
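
Note on the generated config: the "Run benchmark" step writes config.toml from the heredoc, substituting the matrix provider and model values, and appends the parameters.max_new_tokens line only when the matrix entry sets max_new_tokens (in this patch, only llama-3-70b-instruct). For that entry the rendered file should come out roughly as:

    log_level = "info"
    demo_mode = false
    [incident_store]
    provider = "postgresql"
    [incident_store.args]
    host = "127.0.0.1"
    database = "kai"
    user = "kai"
    password = "dog8code"
    [embeddings]
    todo = true
    [models]
    provider = "ChatIBMGenAI"
    [models.args]
    model_id = "meta-llama/llama-3-70b-instruct"
    parameters.max_new_tokens = 2048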
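
Note on the benchmark payload: the echo pipeline writes a one-element JSON array to output.txt in the name/unit/value shape that github-action-benchmark's customBiggerIsBetter tool consumes, with the value taken from the third field of the last line evaluation.py prints. For the same matrix entry it would look like the following, where the score itself is purely illustrative:

    [{"name": "ChatIBMGenAI_meta-llama_llama-3-70b-instruct_example_a", "unit": "Match", "value": "0.42"}]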
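
A single matrix cell can also be reproduced outside CI by running the same commands the job runs; a minimal sketch, assuming config.toml has been written as above and that evaluation.py reads GENAI_KEY from the environment as the workflow's env block suggests:

    pip install -r requirements.txt
    pip install -e .
    cd kai
    GENAI_KEY=<your IBM GenAI key> python evaluation.py --configs ./config.toml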