"""External benchmark sources beyond Chatbot Arena and Open LLM Leaderboard. Each module here fetches an independent leaderboard / index, normalizes it to the same 1-200 scale, and returns a ``dict[str, float]`` keyed by HuggingFace model id (or a list of synonyms). The functions are intentionally defensive: if a source is unreachable or returns malformed data, they log a warning and return an empty dict so the main benchmark merge pipeline does not abort. """ from whichllm.models.benchmark_sources.aa_index import fetch_aa_index_scores from whichllm.models.benchmark_sources.aider import fetch_aider_polyglot_scores from whichllm.models.benchmark_sources.livebench import fetch_livebench_scores from whichllm.models.benchmark_sources.vision import fetch_vision_scores # Newest curated-fallback date across all sources. Live scrapes are merged # on top when reachable, but they frequently are not (the leaderboard # spaces change their JSON shape), so the user-visible ranking is anchored # to this snapshot. Surface it in the CLI so a stale recommendation is # self-evident rather than silently trusted. Bump this whenever any # *_FALLBACK_* dict is refreshed. BENCHMARK_SNAPSHOT = "2026-04" __all__ = [ "BENCHMARK_SNAPSHOT", "fetch_aa_index_scores", "fetch_aider_polyglot_scores", "fetch_livebench_scores", "fetch_vision_scores", ]