-
Notifications
You must be signed in to change notification settings - Fork 713
Expand file tree
/
Copy patheval.ts
More file actions
116 lines (116 loc) · 4.89 KB
/
eval.ts
File metadata and controls
116 lines (116 loc) · 4.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/**
 * Evaluation frameworks that can be referenced from the `eval.yaml` file of benchmark datasets.
 *
 * The object is keyed by framework identifier. Each entry repeats that identifier in `name`
 * and adds a short human-readable `description` plus a `url` for the framework.
 *
 * Declared `as const` so that keys and values are exposed as readonly literal types.
 */
export const EVALUATION_FRAMEWORKS = {
  "exgentic": {
    name: "exgentic",
    description: "Exgentic is an open evaluation framework for general-purpose AI agents across diverse domains and benchmarks.",
    url: "https://github.com/Exgentic/exgentic",
  },
  "inspect-ai": {
    name: "inspect-ai",
    description: "Inspect AI is an open-source framework for large language model evaluations.",
    url: "https://inspect.aisi.org.uk/",
  },
  "math-arena": {
    name: "math-arena",
    description: "MathArena is a platform for evaluation of LLMs on latest math competitions and olympiads.",
    url: "https://github.com/eth-sri/matharena",
  },
  "mteb": {
    name: "mteb",
    description: "Multimodal toolbox for evaluating embeddings and retrieval systems.",
    url: "https://github.com/embeddings-benchmark/mteb",
  },
  "olmocr-bench": {
    name: "olmocr-bench",
    description: "olmOCR-Bench is a framework for evaluating document-level OCR of various tools.",
    url: "https://github.com/allenai/olmocr/tree/main/olmocr/bench",
  },
  "harbor": {
    name: "harbor",
    description: "Harbor is a framework for evaluating and optimizing agents and language models.",
    url: "https://github.com/laude-institute/harbor",
  },
  "archipelago": {
    name: "archipelago",
    description: "Archipelago is a system for running and evaluating AI agents against MCP applications.",
    url: "https://github.com/Mercor-Intelligence/archipelago",
  },
  "apex-evals": {
    name: "apex-evals",
    description: "APEX Evals is a benchmark suite and evaluation harness for evaluating large language models.",
    url: "https://github.com/Mercor-Intelligence/apex-evals",
  },
  "screenspot-pro": {
    name: "screenspot-pro",
    description: "ScreenSpot-Pro is a GUI grounding benchmark designed to evaluate how well AI agents can locate and identify UI elements across professional software applications in high-resolution screenshots, covering 1,585 annotated images from 26 professional tools.",
    url: "https://github.com/likaixin2000/ScreenSpot-Pro-GUI-Grounding",
  },
  "swe-bench": {
    name: "swe-bench",
    description: "SWE Bench is a framework for evaluating the performance of LLMs on software engineering tasks.",
    url: "https://github.com/swe-bench/swe-bench",
  },
  "swe-bench-pro": {
    name: "swe-bench-pro",
    description: "SWE-Bench Pro is a challenging benchmark evaluating LLMs/Agents on long-horizon software engineering tasks.",
    url: "https://github.com/scaleapi/SWE-bench_Pro-os",
  },
  "nemo-evaluator": {
    name: "nemo-evaluator",
    description: "NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models across 100+ benchmarks.",
    url: "https://github.com/NVIDIA-NeMo/Evaluator",
  },
  "yc-bench": {
    name: "yc-bench",
    description: "YC Bench is a long-horizon deterministic benchmark for LLM agents. The agent plays CEO of an AI startup over a simulated 1–3 year run.",
    url: "https://github.com/collinear-ai/yc-bench",
  },
  "open-asr-leaderboard": {
    name: "open-asr-leaderboard",
    description: "The Open ASR Leaderboard ranks and evaluates speech recognition models.",
    url: "https://github.com/huggingface/open_asr_leaderboard",
  },
  "mdpbench": {
    name: "mdpbench",
    description: "MDPBench is a benchmark for evaluating multilingual document parsing across digital, photographed, Latin, and non-Latin document subsets.",
    url: "https://github.com/Yuliang-Liu/MultimodalOCR",
  },
  "parsebench": {
    name: "parsebench",
    description: "ParseBench is a benchmark for evaluating document parsing systems on real-world enterprise documents across tables, charts, content faithfulness, semantic formatting, and visual grounding.",
    url: "https://github.com/run-llama/ParseBench",
  },
  "video-mme-v2": {
    name: "video-mme-v2",
    description: "Video-MME-v2 is a benchmark for evaluating the next stage of video understanding capabilities of multimodal large language models.",
    url: "https://github.com/MME-Benchmarks/Video-MME-v2",
  },
  "claw-eval": {
    name: "claw-eval",
    description: "CLAW-Eval is an evaluation framework for assessing LLMs as autonomous agents across 300 human-verified tasks covering communication, finance, and productivity domains.",
    url: "https://github.com/claw-eval/claw-eval",
  },
  "pbench": {
    name: "pbench",
    description: "PBench is a multi-level referring expression segmentation benchmark for evaluating vision-language perception across a structured hierarchy of skills.",
    url: "https://github.com/tiiuae/Falcon-Perception",
  },
  "wildclawbench": {
    name: "wildclawbench",
    description: "WildClawBench is an in-the-wild benchmark for evaluating AI agents in the OpenClaw environment across 60 hand-built, end-to-end tasks spanning productivity, code intelligence, social interaction, search, creative synthesis, and safety domains.",
    url: "https://github.com/InternLM/WildClawBench",
  },
} as const;