(.*?)

# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license """ Automates building and post-processing of MkDocs documentation, especially for multilingual projects. This script streamlines generating localized documentation and updating HTML links for correct formatting. Key Features: - Automated building of MkDocs documentation: Compiles main documentation and localized versions from separate MkDocs configuration files. - Post-processing of generated HTML files: Updates HTML files to remove '.md' from internal links, ensuring correct navigation in web-based documentation. Usage: - Run from the root directory of your MkDocs project. - Ensure MkDocs is installed and configuration files (main and localized) are present. - The script builds documentation using MkDocs, then scans HTML files in 'site' to update links. - Ideal for projects with Markdown documentation served as a static website. Note: - Requires Python and MkDocs to be installed and configured. """ from __future__ import annotations import os import re import shutil import subprocess import sys import tempfile import time from pathlib import Path import yaml from bs4 import BeautifulSoup from minijinja import Environment, load_from_path try: from plugin import postprocess_site # mkdocs-ultralytics-plugin except ImportError: postprocess_site = None from build_reference import build_reference_docs from ultralytics.utils import LINUX, LOGGER, MACOS from ultralytics.utils.tqdm import TQDM os.environ["JUPYTER_PLATFORM_DIRS"] = "1" # fix DeprecationWarning: Jupyter is migrating to use standard platformdirs DOCS = Path(__file__).parent.resolve() SITE = DOCS.parent / "site" LINK_PATTERN = re.compile(r"(https?://[^\s()<>]*[^\s()<>.,:;!?\'\"])") TITLE_PATTERN = re.compile(r"(.*?)", flags=re.IGNORECASE | re.DOTALL) MD_LINK_PATTERN = re.compile(r'(["\']?)([^"\'>\s]+?)\.md(["\']?)') DOC_KIND_LABELS = {"Class", "Function", "Method", "Property"} DOC_KIND_COLORS = { "Class": "#039dfc", # blue "Method": "#ef5eff", 
# magenta "Function": "#fc9803", # orange "Property": "#02e835", # green } def prepare_docs_markdown(clone_repos: bool = True): """Build docs using mkdocs.""" LOGGER.info("Removing existing build artifacts") shutil.rmtree(SITE, ignore_errors=True) shutil.rmtree(DOCS / "repos", ignore_errors=True) if clone_repos: # Get docs repo repo = "https://github.com/ultralytics/docs" local_dir = DOCS / "repos" / Path(repo).name subprocess.run( ["git", "clone", "-q", "--depth=1", "--single-branch", "-b", "main", repo, str(local_dir)], check=True ) shutil.rmtree(DOCS / "en/compare", ignore_errors=True) # delete if exists shutil.copytree(local_dir / "docs/en/compare", DOCS / "en/compare") # for docs LOGGER.info(f"Cloned/Updated {repo} in {local_dir}") # Add frontmatter for file in TQDM((DOCS / "en").rglob("*.md"), desc="Adding frontmatter"): update_markdown_files(file) def update_markdown_files(md_filepath: Path): """Create or update a Markdown file, ensuring frontmatter is present.""" if md_filepath.exists(): content = md_filepath.read_text().strip() # Replace apostrophes content = content.replace("‘", "'").replace("’", "'") # Add frontmatter if missing if not content.strip().startswith("---\n"): header = "---\ncomments: true\ndescription: TODO ADD DESCRIPTION\nkeywords: TODO ADD KEYWORDS\n---\n\n" content = header + content # Ensure MkDocs admonitions "=== " lines are preceded and followed by empty newlines lines = content.split("\n") new_lines = [] for i, line in enumerate(lines): stripped_line = line.strip() if stripped_line.startswith("=== "): if i > 0 and new_lines[-1] != "": new_lines.append("") new_lines.append(line) if i < len(lines) - 1 and lines[i + 1].strip() != "": new_lines.append("") else: new_lines.append(line) content = "\n".join(new_lines) # Add EOF newline if missing if not content.endswith("\n"): content += "\n" # Save page md_filepath.write_text(content) return def update_docs_html(): """Update titles, edit links, and convert plaintext links in HTML 
documentation in one pass.""" from concurrent.futures import ProcessPoolExecutor html_files = list(SITE.rglob("*.html")) if not html_files: LOGGER.info("Updated HTML files: 0") return desc = f"Updating HTML at {SITE}" max_workers = os.cpu_count() or 1 with ProcessPoolExecutor(max_workers=max_workers) as executor: pbar = TQDM(executor.map(_process_html_file, html_files), total=len(html_files), desc=desc) updated = 0 for res in pbar: updated += bool(res) pbar.set_description(f"{desc} ({updated}/{len(html_files)} updated)") def _process_html_file(html_file: Path) -> bool: """Process a single HTML file; returns True if modified.""" try: content = html_file.read_text(encoding="utf-8") except Exception as e: LOGGER.warning(f"Could not read {html_file}: {e}") return False changed = False try: rel_path = html_file.relative_to(SITE).as_posix() except ValueError: rel_path = html_file.name # For pages sourced from external repos (compare), drop edit/copy buttons to avoid wrong links if rel_path.startswith("compare/"): before = content content = re.sub( r']*class="[^"]*md-content__button[^"]*"[^>]*>.*?', "", content, flags=re.IGNORECASE | re.DOTALL, ) if content != before: changed = True if rel_path == "404.html": new_content = re.sub(r".*?", "Ultralytics Docs - Not Found", content) if new_content != content: content, changed = new_content, True new_content = update_docs_soup(content, html_file=html_file) if new_content != content: content, changed = new_content, True new_content = _rewrite_md_links(content) if new_content != content: content, changed = new_content, True if changed: try: html_file.write_text(content, encoding="utf-8") return True except Exception as e: LOGGER.warning(f"Could not write {html_file}: {e}") return False def update_docs_soup(content: str, html_file: Path | None = None, max_title_length: int = 70) -> str: """Convert plaintext links to HTML hyperlinks, truncate long meta titles, and remove code line hrefs.""" title_match = 
TITLE_PATTERN.search(content) needs_title_trim = bool( title_match and len(title_match.group(1)) > max_title_length and "-" in title_match.group(1) ) needs_link_conversion = (" max_title_length and "-" in title_tag.text: title_tag.string = title_tag.text.rsplit("-", 1)[0].strip() modified = True # Find the main content area main_content = soup.find("main") or soup.find("div", class_="md-content") if not main_content: return str(soup) if modified else content # Convert plaintext links to HTML hyperlinks if needs_link_conversion: for paragraph in main_content.select("p, li"): for text_node in paragraph.find_all(string=True, recursive=False): if text_node.parent.name not in {"a", "code"}: new_text = LINK_PATTERN.sub(r'\1', str(text_node)) if " 0: tail = " " if tail: span.insert_after(tail) modified = True if "reference" in rel_path: highlight_labels(soup.select("main h1, main h2, main h3, main h4, main h5")) highlight_labels(soup.select("nav.md-nav--secondary .md-ellipsis, nav.md-nav__list .md-ellipsis")) if "reference" in rel_path: for ellipsis in soup.select("nav.md-nav--secondary .md-ellipsis"): kind = ellipsis.find(class_=lambda c: c and "doc-kind" in c.split()) text = str(kind.next_sibling).strip() if kind and kind.next_sibling else ellipsis.get_text(strip=True) if "." 
not in text: continue ellipsis.clear() short = text.rsplit(".", 1)[-1] if kind: ellipsis.append(kind) ellipsis.append(f" {short}") else: ellipsis.append(short) modified = True if needs_kind_highlight and not modified and soup.select(".doc-kind"): # Ensure style injection when pre-existing badges are present modified = True if modified: head = soup.find("head") if head and not soup.select("style[data-doc-kind]"): style = soup.new_tag("style", attrs={"data-doc-kind": "true"}) style.string = ( ".doc-kind{display:inline-flex;align-items:center;gap:0.25em;padding:0.21em 0.59em;border-radius:999px;" "font-weight:700;font-size:0.81em;letter-spacing:0.06em;text-transform:uppercase;" "line-height:1;color:var(--doc-kind-color,#f8fafc);" "background:var(--doc-kind-bg,rgba(255,255,255,0.12));}" f".doc-kind-class{{--doc-kind-color:{DOC_KIND_COLORS['Class']};--doc-kind-bg:rgba(3,157,252,0.22);}}" f".doc-kind-function{{--doc-kind-color:{DOC_KIND_COLORS['Function']};--doc-kind-bg:rgba(252,152,3,0.22);}}" f".doc-kind-method{{--doc-kind-color:{DOC_KIND_COLORS['Method']};--doc-kind-bg:rgba(239,94,255,0.22);}}" f".doc-kind-property{{--doc-kind-color:{DOC_KIND_COLORS['Property']};--doc-kind-bg:rgba(2,232,53,0.22);}}" ) head.append(style) return str(soup) if modified else content def _rewrite_md_links(content: str) -> str: """Replace .md references with trailing slashes in HTML content, skipping GitHub links.""" if ".md" not in content: return content lines = [] for line in content.split("\n"): if "github.com" not in line: line = line.replace("index.md", "") line = MD_LINK_PATTERN.sub(r"\1\2/\3", line) lines.append(line) return "\n".join(lines) # Precompiled regex patterns for minification HTML_COMMENT = re.compile(r"") HTML_PRESERVE = re.compile(r"<(pre|code|textarea|script)[^>]*>[\s\S]*?", re.IGNORECASE) HTML_TAG_SPACE = re.compile(r">\s+<") HTML_MULTI_SPACE = re.compile(r"\s{2,}") HTML_EMPTY_LINE = re.compile(r"^\s*$\n", re.MULTILINE) CSS_COMMENT = re.compile(r"/\*[\s\S]*?\*/") def 
def remove_comments_and_empty_lines(content: str, file_type: str) -> str:
    """Remove comments and redundant whitespace from HTML, CSS, or JS source.

    This is the dependency-free fallback used when no dedicated minifier is selected for a file type.

    Args:
        content (str): Code content to process.
        file_type (str): Type of file ('html', 'css', or 'js'). Any other value returns `content` unchanged.

    Returns:
        (str): Cleaned content with comments and empty lines removed.

    Notes:
        Typical reductions for Ultralytics Docs are:
        - Total HTML reduction: 2.83% (1301.56 KB saved)
        - Total CSS reduction: 1.75% (2.61 KB saved)
        - Total JS reduction: 13.51% (99.31 KB saved)

        For JS, string/template literals (and regex literals that directly follow an operator or opening
        punctuation) are left byte-for-byte intact, so URLs and `//` inside them survive while real comments are
        always removed.
    """
    if file_type == "html":
        content = HTML_COMMENT.sub("", content)  # Remove HTML comments
        # Preserve whitespace in <pre>, <code>, <textarea> and <script> blocks
        preserved = []

        def preserve(match):
            """Mark HTML blocks that should not be minified."""
            preserved.append(match.group(0))
            return f"___PRESERVE_{len(preserved) - 1}___"

        def restore(match):
            """Swap a placeholder back for its block; leave look-alike text that was never a placeholder."""
            index = int(match.group(1))
            return preserved[index] if index < len(preserved) else match.group(0)

        content = HTML_PRESERVE.sub(preserve, content)
        content = HTML_TAG_SPACE.sub("><", content)  # Remove whitespace between tags
        content = HTML_MULTI_SPACE.sub(" ", content)  # Collapse multiple spaces
        content = HTML_EMPTY_LINE.sub("", content)  # Remove empty lines
        # Restore preserved content in a single pass instead of one full-document replace per block
        content = re.sub(r"___PRESERVE_(0|[1-9]\d*)___", restore, content)
    elif file_type == "css":
        content = CSS_COMMENT.sub("", content)  # Remove CSS comments
        # Remove whitespace around specific characters
        content = re.sub(r"\s*([{}:;,])\s*", r"\1", content)
        # Remove empty lines
        content = re.sub(r"^\s*\n", "", content, flags=re.MULTILINE)
        # Collapse multiple spaces to single space
        content = re.sub(r"\s{2,}", " ", content)
        # Remove all newlines
        content = re.sub(r"\n", "", content)
    elif file_type == "js":
        content = _minify_js_fallback(content)

    return content


# One left-to-right scan so that comment markers inside literals are never mistaken for comments:
#   comment - `// ...` up to (not including) the newline, or `/* ... */`
#   literal - "...", '...' (single line) or `...` (may span lines), honoring backslash escapes
#   regex   - /.../flags, recognised only right after a character that cannot end an operand
#   escape  - a backslash pair in plain code, e.g. `\/` inside a regex literal that was not recognised
_JS_TOKEN = re.compile(
    r"(?P<comment>//[^\n]*|/\*[\s\S]*?\*/)"
    r'|(?P<literal>"(?:\\[\s\S]|[^"\\\n])*"'
    r"|'(?:\\[\s\S]|[^'\\\n])*'"
    r"|`(?:\\[\s\S]|[^`\\])*`)"
    r"|(?<=[(,=:\[!&|?{;])(?P<gap>\s*)(?P<regex>/(?![/*])(?:\\.|\[(?:\\.|[^\]\\\n])*\]|[^/\\\n\[])+/[A-Za-z]*)"
    r"|(?P<escape>\\[\s\S])"
)


def _minify_js_fallback(source: str) -> str:
    """Strip JS comments and squeeze whitespace without touching the contents of literals.

    Literals are swapped for NUL-delimited placeholders while whitespace is squeezed and restored afterwards.
    Regex literals that follow a keyword (e.g. `return /x/`) are not recognised and are treated as plain code.
    """
    protect = "\x00" not in source  # placeholders would be ambiguous if the source already contains NUL
    literals: list[str] = []

    def swap(match: re.Match) -> str:
        """Drop comments, keep escape pairs verbatim, and replace literals with placeholders."""
        if match.group("comment") is not None:
            return ""
        text = match.group("literal") or match.group("regex")
        if text is None or not protect:
            return match.group(0)
        literals.append(text)
        return f"{match.group('gap') or ''}\x00{len(literals) - 1}\x00"

    code = _JS_TOKEN.sub(swap, source)
    # Remove empty lines
    code = re.sub(r"^\s*\n", "", code, flags=re.MULTILINE)
    # Collapse whitespace runs, keeping a newline when the run had one so semicolon-free code stays valid
    code = re.sub(r"\s{2,}", lambda m: "\n" if "\n" in m.group(0) else " ", code)
    # Safe space removal around punctuation and operators (never include colons - breaks JS)
    code = re.sub(r"\s*([;{}])\s*", r"\1", code)
    code = re.sub(r"(\w)\s*\(|\)\s*{|\s*([+\-*/=])\s*", lambda m: m.group(0).replace(" ", ""), code)
    if literals:
        code = re.sub("\x00(\\d+)\x00", lambda m: literals[int(m.group(1))], code)
    return code


def minify_files(html: bool = True, css: bool = True, js: bool = True):
    """Minify every HTML, CSS, and JS file under SITE in place, reporting the reduction on the progress bar.

    Args:
        html (bool): Use the `minify_html` package for HTML files.
        css (bool): Use the `csscompressor` package for CSS files.
        js (bool): Use the `jsmin` package for JS files.

    Notes:
        A file type whose flag is False is still processed, using the built-in
        `remove_comments_and_empty_lines` fallback. If a requested package cannot be imported, the problem is
        logged and nothing is minified.
    """
    minify, compress, jsmin = None, None, None
    try:
        if html:
            from minify_html import minify
        if css:
            from csscompressor import compress
        if js:
            import jsmin
    except ImportError as e:
        LOGGER.info(f"Missing required package: {e}")
        return

    minifiers = {
        "html": (lambda x: minify(x, keep_closing_tags=True, minify_css=True, minify_js=True)) if html else None,
        "css": compress if css else None,
        "js": jsmin.jsmin if js else None,
    }
    for ext, minifier in minifiers.items():
        files = list(SITE.rglob(f"*.{ext}"))
        if not files:
            continue
        orig = minified = 0  # running character totals for this file type
        pbar = TQDM(files, desc=f"Minifying {ext.upper()} - reduced 0.00% (0.00 KB saved)")
        for f in pbar:
            content = f.read_text(encoding="utf-8")
            out = minifier(content) if minifier else remove_comments_and_empty_lines(content, ext)
            orig += len(content)
            minified += len(out)
            if out != content:  # skip the write when minification changed nothing
                f.write_text(out, encoding="utf-8")
            saved = orig - minified
            pct = (saved / orig) * 100 if orig else 0.0
            pbar.set_description(f"Minifying {ext.upper()} - reduced {pct:.2f}% ({saved / 1024:.2f} KB saved)")


def render_jinja_macros() -> None:
    """Render MiniJinja macros in Markdown files before building with MkDocs.

    Template variables come from the `extra` section of mkdocs.yml, overlaid with ultralytics/cfg/default.yaml and
    then with each page's own frontmatter. Every `*.md` file under docs/en is rewritten in place, except files
    below a `macros` or `reference` directory and files without `{{` or `{%` in their body. Frontmatter is written
    back unchanged; only the body is rendered. Files that cannot be read or rendered are logged and skipped.
    """
    mkdocs_yml = DOCS.parent / "mkdocs.yml"
    default_yaml = DOCS.parent / "ultralytics" / "cfg" / "default.yaml"

    class SafeFallbackLoader(yaml.SafeLoader):
        """SafeLoader that gracefully skips unknown tags (required for mkdocs.yml)."""

    def _ignore_unknown(loader, tag_suffix, node):
        """Gracefully handle YAML tags that aren't registered."""
        if isinstance(node, yaml.ScalarNode):
            return loader.construct_scalar(node)
        if isinstance(node, yaml.SequenceNode):
            return loader.construct_sequence(node)
        if isinstance(node, yaml.MappingNode):
            return loader.construct_mapping(node)
        return None

    SafeFallbackLoader.add_multi_constructor("", _ignore_unknown)

    def load_yaml(path: Path, *, safe_loader: type[yaml.SafeLoader] = yaml.SafeLoader) -> dict:
        """Load a YAML mapping, returning an empty dict if the file is missing, unreadable, or not a mapping."""
        if not path.exists():
            return {}
        try:
            with open(path, encoding="utf-8") as f:
                data = yaml.load(f, Loader=safe_loader)
        except Exception as e:
            LOGGER.warning(f"Could not load {path}: {e}")
            return {}
        return data if isinstance(data, dict) else {}

    mkdocs_cfg = load_yaml(mkdocs_yml, safe_loader=SafeFallbackLoader)
    extra_vars = mkdocs_cfg.get("extra", {}) or {}
    site_name = mkdocs_cfg.get("site_name", "Ultralytics Docs")
    extra_vars.update(load_yaml(default_yaml))

    env = Environment(
        loader=load_from_path([DOCS / "en", DOCS]),
        auto_escape_callback=lambda _: False,
        trim_blocks=True,
        lstrip_blocks=True,
        keep_trailing_newline=True,
    )

    def indent_filter(value: str, width: int = 4, first: bool = False, blank: bool = False) -> str:
        """Mimic Jinja's indent filter to preserve macros compatibility."""
        prefix = " " * int(width)
        result = []
        for i, line in enumerate(str(value).splitlines(keepends=True)):
            if not line.strip() and not blank:
                result.append(line)
                continue
            if i == 0 and not first:
                result.append(line)
            else:
                result.append(prefix + line)
        return "".join(result)

    env.add_filter("indent", indent_filter)
    reserved_keys = {"name"}  # `name` is passed to render_str as the template name, so it cannot be a variable
    base_context = {**extra_vars, "page": {"meta": {}}, "config": {"site_name": site_name}}

    files_with_macros = 0
    macros_total = 0

    pbar = TQDM((DOCS / "en").rglob("*.md"), desc="MiniJinja: 0 macros, 0 pages")
    for md_file in pbar:
        if "macros" in md_file.parts or "reference" in md_file.parts:
            continue

        try:
            content = md_file.read_text(encoding="utf-8")
        except Exception as e:
            LOGGER.warning(f"Could not read {md_file}: {e}")
            continue
        if "{{" not in content and "{%" not in content:
            continue

        # Split off the leading frontmatter block so it is written back untouched
        parts = content.split("---\n")
        frontmatter = ""
        frontmatter_data = {}
        markdown_content = content
        if content.startswith("---\n") and len(parts) >= 3:
            frontmatter = f"---\n{parts[1]}---\n"
            markdown_content = "---\n".join(parts[2:])
            try:
                loaded = yaml.safe_load(parts[1])
            except Exception as e:
                LOGGER.warning(f"Could not parse frontmatter in {md_file}: {e}")
            else:
                if isinstance(loaded, dict):
                    frontmatter_data = loaded
                elif loaded is not None:
                    # A scalar or list cannot supply template variables; treat it as empty instead of crashing
                    LOGGER.warning(f"Ignoring non-mapping frontmatter in {md_file}")

        macro_hits = markdown_content.count("{{") + markdown_content.count("{%")
        if not macro_hits:
            continue

        context = {k: v for k, v in base_context.items() if k not in reserved_keys}
        context.update({k: v for k, v in frontmatter_data.items() if k not in reserved_keys})
        # Build a fresh `page` dict per file so the shared base context (and the frontmatter itself) is never
        # mutated, and a non-dict `page` value in frontmatter cannot break item assignment
        page = context.get("page")
        context["page"] = {**page, "meta": frontmatter_data} if isinstance(page, dict) else {"meta": frontmatter_data}

        try:
            rendered = env.render_str(markdown_content, name=str(md_file.relative_to(DOCS)), **context)
        except Exception as e:
            LOGGER.warning(f"Error rendering macros in {md_file}: {e}")
            continue

        md_file.write_text(frontmatter + rendered, encoding="utf-8")
        files_with_macros += 1
        macros_total += macro_hits
        pbar.set_description(f"MiniJinja: {macros_total} macros, {files_with_macros} pages")


def backup_docs_sources() -> tuple[Path, list[tuple[Path, Path]]]:
    """Create a temporary backup of docs sources so we can fully restore after building.

    Copies the `en` and `macros` directories under DOCS (those that exist) into a fresh `docs_backup_*` directory
    created next to DOCS.

    Returns:
        (tuple[Path, list[tuple[Path, Path]]]): The backup directory and a list of (source, backup copy) pairs.

    Raises:
        Any error raised while copying; the partially written backup directory is removed first.
    """
    backup_root = Path(tempfile.mkdtemp(prefix="docs_backup_", dir=str(DOCS.parent)))
    copied: list[tuple[Path, Path]] = []
    try:
        for src in (DOCS / "en", DOCS / "macros"):
            if not src.exists():
                continue
            dst = backup_root / src.name
            shutil.copytree(src, dst)
            copied.append((src, dst))
    except BaseException:
        # The caller never receives backup_root on failure, so clean it up here (also on Ctrl+C)
        shutil.rmtree(backup_root, ignore_errors=True)
        raise
    return backup_root, copied


def restore_docs_sources(backup_root: Path, backups: list[tuple[Path, Path]]):
    """Restore docs sources from the temporary backup.

    Args:
        backup_root (Path): Backup directory to delete once restoring is done.
        backups (list[tuple[Path, Path]]): (source, backup copy) pairs; each source is replaced by its copy.
    """
    for src, dst in backups:
        if not dst.exists():
            continue  # never wipe a source tree that has no backup to replace it
        shutil.rmtree(src, ignore_errors=True)
        # dirs_exist_ok: still restore the originals if rmtree could only partially delete the tree
        shutil.copytree(dst, src, dirs_exist_ok=True)
    shutil.rmtree(backup_root, ignore_errors=True)


def main():
    """Build the docs site, post-process and minify its files, then optionally serve it locally.

    The docs sources are backed up first and restored afterwards (also when a step fails), because preparing the
    Markdown and rendering macros rewrite files in place. On macOS/Linux outside GitHub Actions the built site is
    served on port 8000 and opened in a browser.

    Raises:
        SystemExit: If the `zensical` executable is not on PATH.
    """
    if not shutil.which("zensical"):
        raise SystemExit("zensical is not installed. Install it with: pip install -e '.[dev]'")

    start_time = time.perf_counter()
    backup_root: Path | None = None
    docs_backups: list[tuple[Path, Path]] = []
    restored = False

    def restore_all():
        """Restore docs sources from backup once build steps complete."""
        nonlocal restored
        if backup_root:
            LOGGER.info("Restoring docs directory from backup")
            restore_docs_sources(backup_root, docs_backups)
        restored = True

    try:
        backup_root, docs_backups = backup_docs_sources()
        prepare_docs_markdown()
        build_reference_docs(update_nav=False)
        render_jinja_macros()

        # Remove cloned repos before serving/building to keep the tree lean during mkdocs processing
        shutil.rmtree(DOCS / "repos", ignore_errors=True)

        # Build the main documentation
        LOGGER.info(f"Building docs from {DOCS}")
        subprocess.run(["zensical", "build", "-f", str(DOCS.parent / "mkdocs.yml"), "--strict"], check=True)
        LOGGER.info(f"Site built at {SITE}")

        # Remove search index JSON files to disable search
        (SITE / "search.json").unlink(missing_ok=True)

        # Update docs HTML pages
        update_docs_html()

        # Post-process site for meta tags, authors, social cards, and mkdocstrings polish
        if postprocess_site:
            postprocess_site(
                site_dir=SITE,
                docs_dir=DOCS / "en",
                site_url="https://docs.ultralytics.com",
                default_image="https://raw.githubusercontent.com/ultralytics/assets/main/yolov8/banner-yolov8.png",
                default_author="glenn.jocher@ultralytics.com",
                add_desc=False,
                add_image=True,
                add_authors=True,
                add_json_ld=True,
                add_share_buttons=True,
                add_css=False,
                verbose=True,
            )
        else:
            LOGGER.warning("postprocess_site not available; skipping mkdocstrings postprocessing")

        # Minify files (all flags off selects the built-in comment/whitespace stripper for every type)
        minify_files(html=False, css=False, js=False)

        # Add missing pages to sitemap
        sitemap = SITE / "sitemap.xml"
        if sitemap.exists():
            content = sitemap.read_text(encoding="utf-8")
            in_sitemap = set(re.findall(r"<loc>([^<]+)</loc>", content))
            all_pages = {
                f"https://docs.ultralytics.com/{f.relative_to(SITE).as_posix().replace('index.html', '')}"
                for f in SITE.rglob("*.html")
                if f.name != "404.html"
            }
            if missing := (all_pages - in_sitemap):
                entries = "\n".join(f"  <url>\n    <loc>{u}</loc>\n  </url>" for u in sorted(missing))
                sitemap.write_text(content.replace("</urlset>", f"{entries}\n</urlset>"), encoding="utf-8")
            LOGGER.info(
                f"{len(all_pages)}/{len(all_pages)} pages in sitemap.xml ✅ (+{len(missing)} added)"
                if missing
                else f"{len(in_sitemap)}/{len(all_pages)} pages in sitemap.xml ✅"
            )

        # Print results and auto-serve on macOS/Linux
        size = sum(f.stat().st_size for f in SITE.rglob("*") if f.is_file()) / (1 << 20)  # bytes -> MB, unrounded
        duration = time.perf_counter() - start_time
        LOGGER.info(f"Docs built correctly ✅ ({size:.1f}MB, {duration:.1f}s)")

        # Restore sources before optionally serving
        restore_all()

        if (MACOS or LINUX) and not os.getenv("GITHUB_ACTIONS"):
            import socket
            import webbrowser

            url = "http://localhost:8000"
            # Probe the port up front: the server child's stderr is not captured, so a bind failure could not be
            # told apart from any other non-zero exit after the fact
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
                probe.settimeout(1)
                port_in_use = probe.connect_ex(("127.0.0.1", 8000)) == 0

            LOGGER.info(f"Opening browser at {url}")
            webbrowser.open(url)
            if port_in_use:
                LOGGER.info("Port 8000 in use; skipping auto-serve. Serve manually if needed.")
            else:
                try:
                    subprocess.run(
                        [sys.executable, "-m", "http.server", "--directory", str(SITE), "8000"], check=True
                    )
                except KeyboardInterrupt:
                    LOGGER.info(f"\n✅ Server stopped. Restart at {url}")
                except Exception as e:
                    LOGGER.info(f"\n❌ Server failed: {e}")
        else:
            LOGGER.info('Serve site at http://localhost:8000 with "python -m http.server --directory site"')
    finally:
        if not restored:
            restore_all()
        shutil.rmtree(DOCS / "repos", ignore_errors=True)


if __name__ == "__main__":
    main()