ultralytics/setup_encoder_distill.sh
Fatih Akyon 0dddc4e428
feat: add encoder distillation environment setup script
Clone reference repos (EUPE, RADIO, UNIC, DUNE), install pinned deps
(transformers==5.5.0, webdataset==0.2.111), download teacher weights,
and verify all 7 teacher variants load correctly. Also ignore .codex/ and .agents/.
2026-04-11 04:35:29 -05:00

73 lines
2.6 KiB
Bash
Executable file

#!/bin/bash
# Setup for YOLO universal encoder distillation (repos + deps + teacher weights)
# Run this first. Dataset download is separate (get_datacomp12m.sh).
#
# Teachers load via native Python (not TorchScript .ts) because EUPE/DINOv3 RoPE
# position encoding hardcodes device in torch.arange, making traced graphs non-portable.
#
# Strict mode: -e abort on error, -u error on unset vars, pipefail so a failing
# stage of any pipeline aborts the script (plain `set -e` misses both).
set -euo pipefail
# Fail loudly if the repo checkout is not where we expect it.
cd /home/fatih/dev/ultralytics || exit 1
echo "=== Step 1: Clone reference repos ==="
# Clone a repo into dest only if dest does not already exist (idempotent rerun).
clone_if_missing() {
  local dest=$1 url=$2
  [ -d "$dest" ] || git clone "$url" "$dest"
}
clone_if_missing /home/fatih/dev/eupe  https://github.com/facebookresearch/eupe
clone_if_missing /home/fatih/dev/RADIO https://github.com/NVlabs/RADIO
clone_if_missing /home/fatih/dev/unic  https://github.com/naver/unic
clone_if_missing /home/fatih/dev/dune  https://github.com/naver/dune
echo "=== Step 2: Install dependencies ==="
# Activate the project venv so `uv pip` targets the right interpreter.
# shellcheck disable=SC1091
source .venv/bin/activate
# Pinned versions: transformers/webdataset are pinned for reproducibility;
# img2dataset and huggingface_hub are needed by the download scripts.
deps=(
  "transformers==5.5.0"
  "webdataset==0.2.111"
  "img2dataset"
  "huggingface_hub>=1.8.0"
)
uv pip install "${deps[@]}"
echo "=== Step 3: Download EUPE weights (3 variants) ==="
# Fetch the three EUPE checkpoints into the HF cache so later runs are offline.
# Quoted heredoc delimiter ('PY') prevents any shell expansion inside the
# Python code. The previous inline `python -c "…"` form had the loop body at
# column 0, which is a Python IndentationError.
python - <<'PY'
from huggingface_hub import hf_hub_download

for repo, fname in [
    ('facebook/EUPE-ViT-B', 'EUPE-ViT-B.pt'),
    ('facebook/EUPE-ViT-S', 'EUPE-ViT-S.pt'),
    ('facebook/EUPE-ConvNeXt-B', 'EUPE-ConvNeXt-B.pt'),
]:
    path = hf_hub_download(repo, fname)
    print(f' {repo} -> {path}')
PY
echo "=== Step 4: Pre-download DINOv3 + SigLIP2 weights ==="
# Instantiating each model once populates the Hugging Face cache; we discard
# the objects immediately. Quoted heredoc avoids shell expansion and restores
# the loop-body indentation that the inline `python -c` form had stripped
# (an IndentationError as written).
python - <<'PY'
from transformers import AutoModel, SiglipVisionModel

for name in [
    'facebook/dinov3-vitb16-pretrain-lvd1689m',
    'facebook/dinov3-vitl16-pretrain-lvd1689m',
    'facebook/dinov3-convnext-base-pretrain-lvd1689m',
]:
    AutoModel.from_pretrained(name)
    print(f' {name} OK')
SiglipVisionModel.from_pretrained('google/siglip2-giant-opt-patch16-384')
print(' google/siglip2-giant-opt-patch16-384 OK')
PY
echo "=== Step 5: Verify all teachers ==="
# Smoke-test: build each teacher on CPU, run one dummy forward, print output
# shapes. Quoted heredoc restores the indentation of the for-loop and the
# `with torch.no_grad():` body, both at column 0 in the broken inline form.
python - <<'PY'
import torch

from ultralytics.nn.teacher_model import TEACHER_REGISTRY, build_teacher_model

# Skip vit7b (26GB, slow to load) and sam3:l (1024px, needs lots of RAM)
for variant in ['eupe:vitb16', 'eupe:vits16', 'eupe:convnextb',
                'dinov3:vitb16', 'dinov3:vitl16', 'dinov3:convnextb',
                'siglip2:g']:
    reg = TEACHER_REGISTRY[variant]
    teacher = build_teacher_model(variant, torch.device('cpu'))
    dummy = torch.randn(1, 3, reg['imgsz'], reg['imgsz'])
    with torch.no_grad():
        out = teacher.encode(dummy)
    # cls token may be absent for conv backbones; patches is always present.
    cls_shape = out.cls.shape if out.cls is not None else None
    print(f' {variant}: cls={cls_shape}, patches={out.patches.shape}')
    del teacher  # free memory before loading the next teacher
print('All teachers verified OK')
PY
echo "Setup complete! Run get_datacomp12m.sh next for dataset."