ultralytics/setup_encoder_distill.sh
Fatih Akyon 0dddc4e428
feat: add encoder distillation environment setup script
Clone reference repos (EUPE, RADIO, UNIC, DUNE), install pinned deps
(transformers==5.5.0, webdataset==0.2.111), download teacher weights,
and verify all 7 teacher variants load correctly. Also ignore .codex/ and .agents/.
2026-04-11 04:35:29 -05:00

73 lines
2.6 KiB
Bash
Executable file

#!/bin/bash
# Setup for YOLO universal encoder distillation (repos + deps + teacher weights)
# Run this first. Dataset download is separate (get_datacomp12m.sh).
#
# Teachers load via native Python (not TorchScript .ts) because EUPE/DINOv3 RoPE
# position encoding hardcodes device in torch.arange, making traced graphs non-portable.
#
# Strict mode: -e abort on error, -u error on unset vars, pipefail so a failing
# stage of any pipeline aborts the script (plain `set -e` misses both).
set -euo pipefail
# Fail loudly if the repo checkout is not where we expect it.
cd /home/fatih/dev/ultralytics || exit 1
echo "=== Step 1: Clone reference repos ==="
# Clone a repo into dest only if dest does not already exist (idempotent rerun).
clone_if_missing() {
  local dest=$1 url=$2
  [ -d "$dest" ] || git clone "$url" "$dest"
}
clone_if_missing /home/fatih/dev/eupe  https://github.com/facebookresearch/eupe
clone_if_missing /home/fatih/dev/RADIO https://github.com/NVlabs/RADIO
clone_if_missing /home/fatih/dev/unic  https://github.com/naver/unic
clone_if_missing /home/fatih/dev/dune  https://github.com/naver/dune
echo "=== Step 2: Install dependencies ==="
# Activate the project venv so `uv pip` targets the right interpreter.
# shellcheck disable=SC1091
source .venv/bin/activate
# Pinned versions: transformers/webdataset are pinned for reproducibility;
# img2dataset and huggingface_hub are needed by the download scripts.
deps=(
  "transformers==5.5.0"
  "webdataset==0.2.111"
  "img2dataset"
  "huggingface_hub>=1.8.0"
)
uv pip install "${deps[@]}"
echo "=== Step 3: Download EUPE weights (3 variants) ==="
# Fetch the three EUPE checkpoints into the HF cache so later runs are offline.
# Quoted heredoc delimiter ('PY') prevents any shell expansion inside the
# Python code. The previous inline `python -c "…"` form had the loop body at
# column 0, which is a Python IndentationError.
python - <<'PY'
from huggingface_hub import hf_hub_download

for repo, fname in [
    ('facebook/EUPE-ViT-B', 'EUPE-ViT-B.pt'),
    ('facebook/EUPE-ViT-S', 'EUPE-ViT-S.pt'),
    ('facebook/EUPE-ConvNeXt-B', 'EUPE-ConvNeXt-B.pt'),
]:
    path = hf_hub_download(repo, fname)
    print(f' {repo} -> {path}')
PY
echo "=== Step 4: Pre-download DINOv3 + SigLIP2 weights ==="
# Instantiating each model once populates the Hugging Face cache; we discard
# the objects immediately. Quoted heredoc avoids shell expansion and restores
# the loop-body indentation that the inline `python -c` form had stripped
# (an IndentationError as written).
python - <<'PY'
from transformers import AutoModel, SiglipVisionModel

for name in [
    'facebook/dinov3-vitb16-pretrain-lvd1689m',
    'facebook/dinov3-vitl16-pretrain-lvd1689m',
    'facebook/dinov3-convnext-base-pretrain-lvd1689m',
]:
    AutoModel.from_pretrained(name)
    print(f' {name} OK')
SiglipVisionModel.from_pretrained('google/siglip2-giant-opt-patch16-384')
print(' google/siglip2-giant-opt-patch16-384 OK')
PY
echo "=== Step 5: Verify all teachers ==="
# Smoke-test: build each teacher on CPU, run one dummy forward, print output
# shapes. Quoted heredoc restores the indentation of the for-loop and the
# `with torch.no_grad():` body, both at column 0 in the broken inline form.
python - <<'PY'
import torch

from ultralytics.nn.teacher_model import TEACHER_REGISTRY, build_teacher_model

# Skip vit7b (26GB, slow to load) and sam3:l (1024px, needs lots of RAM)
for variant in ['eupe:vitb16', 'eupe:vits16', 'eupe:convnextb',
                'dinov3:vitb16', 'dinov3:vitl16', 'dinov3:convnextb',
                'siglip2:g']:
    reg = TEACHER_REGISTRY[variant]
    teacher = build_teacher_model(variant, torch.device('cpu'))
    dummy = torch.randn(1, 3, reg['imgsz'], reg['imgsz'])
    with torch.no_grad():
        out = teacher.encode(dummy)
    # cls token may be absent for conv backbones; patches is always present.
    cls_shape = out.cls.shape if out.cls is not None else None
    print(f' {variant}: cls={cls_shape}, patches={out.patches.shape}')
    del teacher  # free memory before loading the next teacher
print('All teachers verified OK')
PY
echo "Setup complete! Run get_datacomp12m.sh next for dataset."