# Evals #123
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that runs the gptme eval image once a day and on manual dispatch.
name: Evals

on:
  schedule:
    - cron: '0 2 * * *'  # Run daily at 2 AM UTC
  # Manual runs may override the suite, model list, per-eval timeout and
  # docker image variant; all inputs are optional.
  workflow_dispatch:
    inputs:
      suite:
        description: 'Eval suite to run (leave empty for basic suite)'
        required: false
      models:
        description: 'Comma-separated list of models to evaluate'
        required: false
      timeout:
        description: 'Timeout for each eval in seconds'
        type: number
        required: false
      image-suffix:
        type: choice
        description: 'Suffix to use for docker image'
        options:
          - ''
          - '-full'
jobs:
  # Pulls the eval image from GHCR, runs it with the requested suite/models,
  # and commits the resulting eval_results directory to the eval-results branch.
  run-evals:
    runs-on: ubuntu-latest
    # Explicit token scopes: the job pushes to a branch and pulls from GHCR.
    permissions:
      contents: write
      packages: read
    env:
      OWNER: ${{ github.repository_owner }}
      # Each input falls back to a fixed value when it is not supplied.
      SUITE: ${{ github.event.inputs.suite || '' }}
      MODELS: ${{ github.event.inputs.models || '' }}
      TIMEOUT: ${{ github.event.inputs.timeout || '60' }}
      IMAGE_SUFFIX: ${{ github.event.inputs.image-suffix || '' }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: eval-results

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set environment variables
        # Export a lower-cased copy of the owner name for use in the image path.
        run: |
          echo "OWNER_LC=${OWNER,,}" >> "$GITHUB_ENV"

      - name: Pull Docker image
        run: |
          docker pull "ghcr.io/${OWNER_LC}/gptme-eval${IMAGE_SUFFIX}:latest"

      - name: Write gptme config.toml
        shell: bash
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: |
          mkdir -p ~/.config/gptme
          # Unquoted heredoc delimiter on purpose: the API key variables must
          # be expanded into the file.
          cat > ~/.config/gptme/config.toml << EOF
          [prompt]
          about_user = "I am a curious human programmer."
          response_preference = "Don't explain basic concepts"
          [env]
          OPENAI_API_KEY = "$OPENAI_API_KEY"
          ANTHROPIC_API_KEY = "$ANTHROPIC_API_KEY"
          OPENROUTER_API_KEY = "$OPENROUTER_API_KEY"
          EOF
          # The file holds API keys, so it is deliberately not echoed to the log.
          # NOTE(review): presumably opened up so the container user can access
          # the bind-mounted config -- confirm o=rwx is really required.
          chmod -R o=rwx ~/.config/gptme

      - name: Run evals
        shell: bash
        run: |
          # Build one "-m <model>" pair per comma-separated entry. An array
          # keeps each argument intact; blank entries ("a,,b") are skipped.
          MODEL_ARGS=()
          if [ -n "$MODELS" ]; then
            IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
            for MODEL in "${MODEL_ARRAY[@]}"; do
              MODEL="${MODEL//[[:space:]]/}"  # tolerate "a, b"
              if [ -n "$MODEL" ]; then
                MODEL_ARGS+=(-m "$MODEL")
              fi
            done
          fi

          echo "Running evals for suite: ${SUITE:-basic}"
          echo "Models: ${MODELS:-default}"
          echo "Timeout: ${TIMEOUT} seconds"
          echo "Image suffix: ${IMAGE_SUFFIX}"

          RESULTS_DIR="$(pwd)/eval_results"
          echo "Results dir: $RESULTS_DIR"
          mkdir -p "$RESULTS_DIR"
          chmod -R o=rwx "$RESULTS_DIR"  # make sure docker can read/write the dir

          # ${SUITE:+...} passes the suite argument only when one was given.
          docker run \
            -v ~/.config/gptme:/home/appuser/.config/gptme \
            -v "$RESULTS_DIR":/app/eval_results \
            "ghcr.io/${OWNER_LC}/gptme-eval${IMAGE_SUFFIX}:latest" \
            ${SUITE:+"$SUITE"} \
            --timeout "$TIMEOUT" \
            "${MODEL_ARGS[@]}"

      - name: Configure Git
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"

      - name: Update eval results
        run: |
          git pull origin eval-results
          # -f: stage eval_results even if an ignore rule matches it.
          git add -f eval_results
          # Only an empty index counts as "nothing to do"; a real commit or
          # push failure now fails the step instead of being swallowed.
          if git diff --cached --quiet; then
            echo "No changes to commit"
          else
            git commit -m "chore: add eval results for run ${{ github.run_number }} [skip ci]"
            git push origin eval-results
          fi