Skip to content

Optimum neuron LLM inference cache builder #118

Optimum neuron LLM inference cache builder

Optimum neuron LLM inference cache builder #118

name: Optimum neuron LLM inference cache builder
on:
workflow_dispatch:
schedule:
# Schedule the workflow to run every day at midnight UTC
- cron: '0 0 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
jobs:
cache:
name: Create optimum-neuron inference cache
runs-on:
group: aws-inf2-48xlarge
env:
AWS_REGION: us-east-1
strategy:
fail-fast: false
matrix:
config: [
gpt2,
llama,
qwen2.5,
llama3.1-70b,
qwen2.5-large,
mistral,
llama-variants,
mistral-variants,
mixtral
]
steps:
- name: Install Neuron runtime
run: |
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -y
sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.19.0-5856c0b42 aws-neuronx-collectives=2.22.33.0-d2128d1aa -y
export PATH=/opt/aws/neuron/bin:$PATH
- name: Checkout
uses: actions/checkout@v4
- name: Install python and create venv
run: |
sudo apt install python3-venv python3-dev -y
python3 -m venv aws_neuron_venv_pytorch
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
- name: Install optimum neuron
run: |
source aws_neuron_venv_pytorch/bin/activate
python -m pip install .[neuronx]
- name: Create cache for ${{matrix.config}} models
run: |
source aws_neuron_venv_pytorch/bin/activate
config_prefix_url=https://huggingface.co/aws-neuron/optimum-neuron-cache/raw/main/inference-cache-config
HF_TOKEN=${{secrets.HF_TOKEN_OPTIMUM_NEURON_CACHE}} \
python tools/auto_fill_inference_cache.py --config_file ${config_prefix_url}/${{matrix.config}}.json