diff --git a/index.html b/index.html
index 25e3ff3..0ad364f 100644
--- a/index.html
+++ b/index.html
@@ -167,16 +167,12 @@
🤠
- ✅
- SWE-agent + GPT 4 (1106)
+
+ Lingma Agent + Lingma SWE-GPT 7b (v0925)
-
🤠
- ✅
- SWE-agent + Claude 3 Opus
+
+ Lingma Agent + Lingma SWE-GPT 7b (v0918)
-
-
+
- 🔗
+
+ IBM SWE-1.0 (with open LLMs)
+
+ |
+ 23.67 |
+ 2024-10-16 |
+
+
+ ✓
+
+ |
+
+
+ ✓
|
- 🔗
+ 🔗
|
@@ -1995,9 +2313,7 @@ Leaderboard
2024-05-24 |
-
- 🔗
-
+ ✓
|
@@ -2027,16 +2343,12 @@ Leaderboard
| 2024-06-20 |
-
- 🔗
-
+ ✓
|
-
- 🔗
-
+ ✓
|
@@ -2059,9 +2371,7 @@ Leaderboard
| 2024-06-15 |
-
- 🔗
-
+ ✓
|
@@ -2091,16 +2401,12 @@ Leaderboard
| 2024-08-28 |
-
- 🔗
-
+ ✓
|
-
- 🔗
-
+ ✓
|
@@ -2123,9 +2429,7 @@ Leaderboard
| 2024-05-09 |
-
- 🔗
-
+ ✓
|
@@ -2155,9 +2459,7 @@ Leaderboard
| 2024-05-30 |
-
- 🔗
-
+ ✓
|
@@ -2187,16 +2489,12 @@ Leaderboard
| 2024-07-28 |
-
- 🔗
-
+ ✓
|
-
- 🔗
-
+ ✓
|
@@ -2221,16 +2519,12 @@ Leaderboard
| 2024-04-02 |
-
- 🔗
-
+ ✓
|
-
- 🔗
-
+ ✓
|
@@ -2255,16 +2549,12 @@ Leaderboard
| 2024-04-02 |
-
- 🔗
-
+ ✓
|
-
- 🔗
-
+ ✓
|
@@ -2287,9 +2577,7 @@ Leaderboard
| 2024-04-02 |
-
- 🔗
-
+ ✓
|
@@ -2319,14 +2607,12 @@ Leaderboard
| 2023-10-10 |
-
- 🔗
-
+ ✓
|
- -
+ -
|
@@ -2349,9 +2635,7 @@ Leaderboard
| 2024-04-02 |
-
- 🔗
-
+ ✓
|
@@ -2379,9 +2663,7 @@ Leaderboard
| 2023-10-10 |
-
- 🔗
-
+ ✓
|
@@ -2409,9 +2691,7 @@ Leaderboard
| 2023-10-10 |
-
- 🔗
-
+ ✓
|
@@ -2439,9 +2719,7 @@ Leaderboard
| 2023-10-10 |
-
- 🔗
-
+ ✓
|
diff --git a/template/data.json b/template/data.json
index f0a2e1b..2c811a0 100644
--- a/template/data.json
+++ b/template/data.json
@@ -8,8 +8,8 @@
"folder": "20240820_honeycomb",
"resolved": 22.06,
"date": "2024-08-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240820_honeycomb/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240820_honeycomb/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://honeycomb.sh/",
"verified": false,
"oss": false
@@ -19,8 +19,8 @@
"folder": "20240721_amazon-q-developer-agent-20240719-dev",
"resolved": 19.75,
"date": "2024-07-21",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240721_amazon-q-developer-agent-20240719-dev/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240721_amazon-q-developer-agent-20240719-dev/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://aws.amazon.com/q/developer/",
"verified": false,
"oss": false
@@ -30,8 +30,8 @@
"folder": "20240617_factory_code_droid",
"resolved": 19.27,
"date": "2024-06-17",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240617_factory_code_droid/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://www.factory.ai/",
"verified": false,
"oss": false
@@ -41,8 +41,8 @@
"folder": "20240628_autocoderover-v20240620",
"resolved": 18.83,
"date": "2024-06-28",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240628_autocoderover-v20240620/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://autocoderover.dev/",
"verified": false,
"oss": false
@@ -52,8 +52,8 @@
"folder": "20240620_sweagent_claude3.5sonnet",
"resolved": 18.13,
"date": "2024-06-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240620_sweagent_claude3.5sonnet/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240620_sweagent_claude3.5sonnet/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": true,
"oss": true
@@ -63,8 +63,8 @@
"folder": "20240615_appmap-navie_gpt4o",
"resolved": 14.6,
"date": "2024-06-15",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240615_appmap-navie_gpt4o/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://appmap.io/navie",
"verified": true,
"oss": true
@@ -74,8 +74,8 @@
"folder": "20240509_amazon-q-developer-agent-20240430-dev",
"resolved": 13.82,
"date": "2024-05-09",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240509_amazon-q-developer-agent-20240430-dev/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://aws.amazon.com/q/developer/",
"verified": false,
"oss": false
@@ -85,8 +85,8 @@
"folder": "20240402_sweagent_gpt4",
"resolved": 12.47,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_gpt4/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_gpt4/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/princeton-nlp/SWE-agent",
"verified": true,
"oss": true
@@ -96,8 +96,8 @@
"folder": "20240728_sweagent_gpt4o",
"resolved": 11.99,
"date": "2024-07-28",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240728_sweagent_gpt4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240728_sweagent_gpt4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/princeton-nlp/SWE-agent",
"verified": true,
"oss": true
@@ -107,8 +107,8 @@
"folder": "20240402_sweagent_claude3opus",
"resolved": 10.51,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_claude3opus/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_sweagent_claude3opus/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": true,
"oss": true
@@ -118,8 +118,8 @@
"folder": "20240402_rag_claude3opus",
"resolved": 3.79,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_rag_claude3opus/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/princeton-nlp/SWE-bench/tree/main/swebench/inference",
"verified": true,
"oss": true
@@ -129,8 +129,8 @@
"folder": "20231010_rag_claude2",
"resolved": 1.96,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_claude2/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -140,8 +140,8 @@
"folder": "20240402_rag_gpt4",
"resolved": 1.31,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20240402_rag_gpt4/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -151,8 +151,8 @@
"folder": "20231010_rag_swellama13b",
"resolved": 0.7,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_swellama13b/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -162,8 +162,8 @@
"folder": "20231010_rag_swellama7b",
"resolved": 0.7,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_swellama7b/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -173,8 +173,8 @@
"folder": "20231010_rag_gpt35",
"resolved": 0.17,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/test/20231010_rag_gpt35/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -184,35 +184,90 @@
{
"name": "Verified",
"results": [
+ {
+ "name": "Tools + Claude 3.5 Sonnet (2024-10-22)",
+ "folder": "20241022_tools_claude-3-5-sonnet-updated",
+ "resolved": 49.0,
+ "date": "2024-10-22",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.anthropic.com/",
+ "verified": false,
+ "oss": false
+ },
+ {
+ "name": "Solver (2024-09-12)",
+ "folder": "20240924_solver",
+ "resolved": 45.4,
+ "date": "2024-09-24",
+ "logs": true,
+ "trajs": true,
+ "site": "https://laredolabs.com/",
+ "verified": false,
+ "oss": false
+ },
{
"name": "Gru(2024-08-24)",
"folder": "20240824_gru",
"resolved": 45.2,
"date": "2024-08-24",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240824_gru/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240824_gru/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://gru.ai",
"verified": false,
"oss": false
},
+ {
+ "name": "Solver (2024-09-12)",
+ "folder": "20240920_solver",
+ "resolved": 43.6,
+ "date": "2024-09-20",
+ "logs": true,
+ "trajs": true,
+ "site": "https://laredolabs.com/",
+ "verified": false,
+ "oss": false
+ },
+ {
+ "name": "Tools + Claude 3.5 Haiku",
+ "folder": "20241022_tools_claude-3-5-haiku",
+ "resolved": 40.6,
+ "date": "2024-10-22",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.anthropic.com/",
+ "verified": false,
+ "oss": false
+ },
{
"name": "Honeycomb",
"folder": "20240820_honeycomb",
"resolved": 40.6,
"date": "2024-08-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240820_honeycomb/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240820_honeycomb/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://honeycomb.sh/",
"verified": false,
"oss": false
},
+ {
+ "name": "Composio SWEkit + Claude 3.5 Sonnet (2024-10-16)",
+ "folder": "20241016_composio_swekit",
+ "resolved": 40.6,
+ "date": "2024-10-16",
+ "logs": true,
+ "trajs": true,
+ "site": "https://github.com/ComposioHQ/composio/tree/master/python/swe/agent",
+ "verified": false,
+ "oss": true
+ },
{
"name": "Amazon Q Developer Agent (v20240719-dev)",
"folder": "20240721_amazon-q-developer-agent-20240719-dev",
"resolved": 38.8,
"date": "2024-07-21",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240721_amazon-q-developer-agent-20240719-dev/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240721_amazon-q-developer-agent-20240719-dev/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://aws.amazon.com/q/developer/",
"verified": false,
"oss": false
@@ -222,8 +277,8 @@
"folder": "20240628_autocoderover-v20240620",
"resolved": 38.4,
"date": "2024-06-28",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240628_autocoderover-v20240620/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://autocoderover.dev/",
"verified": false,
"oss": false
@@ -233,8 +288,8 @@
"folder": "20240617_factory_code_droid",
"resolved": 37.0,
"date": "2024-06-17",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240617_factory_code_droid/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://www.factory.ai/",
"verified": false,
"oss": false
@@ -244,41 +299,96 @@
"folder": "20240620_sweagent_claude3.5sonnet",
"resolved": 33.6,
"date": "2024-06-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240620_sweagent_claude3.5sonnet/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240620_sweagent_claude3.5sonnet/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": true,
"oss": true
},
+ {
+ "name": "nFactorial (2024-10-07)",
+ "folder": "20241007_nfactorial",
+ "resolved": 31.6,
+ "date": "2024-10-07",
+ "logs": true,
+ "trajs": true,
+ "site": "https://nfactorial.dev/",
+ "verified": false,
+ "oss": false
+ },
+ {
+ "name": "Lingma Agent + Lingma SWE-GPT 72b (v0925)",
+ "folder": "20241002_lingma-agent_lingma-swe-gpt-72b",
+ "resolved": 28.8,
+ "date": "2024-10-02",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.modelscope.cn/models/yingwei/Lingma-SWE-GPT (https://www.modelscope.cn/models/yingwei/Lingma-SWE-GPT-v20240925)",
+ "verified": false,
+ "oss": true
+ },
+ {
+ "name": "EPAM AI/Run Developer Agent + GPT4o",
+ "folder": "20241016_epam-ai-run-gpt-4o",
+ "resolved": 27.0,
+ "date": "2024-10-16",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.epam.com/services/artificial-intelligence",
+ "verified": false,
+ "oss": false
+ },
{
"name": "AppMap Navie + GPT 4o (2024-05-13)",
"folder": "20240615_appmap-navie_gpt4o",
"resolved": 26.2,
"date": "2024-06-15",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240615_appmap-navie_gpt4o/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://appmap.io/navie",
"verified": true,
"oss": true
},
+ {
+ "name": "nFactorial (2024-10-01)",
+ "folder": "20241001_nfactorial",
+ "resolved": 25.8,
+ "date": "2024-10-01",
+ "logs": true,
+ "trajs": true,
+ "site": "https://nfactorial.dev/",
+ "verified": false,
+ "oss": false
+ },
{
"name": "Amazon Q Developer Agent (v20240430-dev)",
"folder": "20240509_amazon-q-developer-agent-20240430-dev",
"resolved": 25.6,
"date": "2024-05-09",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240509_amazon-q-developer-agent-20240430-dev/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://aws.amazon.com/q/developer/",
"verified": false,
"oss": false
},
+ {
+ "name": "Lingma Agent + Lingma SWE-GPT 72b (v0918)",
+ "folder": "20240918_lingma-agent_lingma-swe-gpt-72b",
+ "resolved": 25.0,
+ "date": "2024-09-18",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.modelscope.cn/models/yingwei/Lingma-SWE-GPT",
+ "verified": false,
+ "oss": true
+ },
{
"name": "EPAM AI/Run Developer Agent + GPT4o",
"folder": "20240820_epam-ai-run-gpt-4o",
"resolved": 24.0,
"date": "2024-08-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240820_epam-ai-run-gpt-4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240820_epam-ai-run-gpt-4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://www.epam.com/services/artificial-intelligence",
"verified": false,
"oss": false
@@ -288,8 +398,8 @@
"folder": "20240728_sweagent_gpt4o",
"resolved": 23.2,
"date": "2024-07-28",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240728_sweagent_gpt4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240728_sweagent_gpt4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/princeton-nlp/SWE-agent",
"verified": true,
"oss": true
@@ -299,8 +409,8 @@
"folder": "20240402_sweagent_gpt4",
"resolved": 22.4,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240402_sweagent_gpt4/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240402_sweagent_gpt4/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/princeton-nlp/SWE-agent",
"verified": true,
"oss": true
@@ -310,19 +420,41 @@
"folder": "20240402_sweagent_claude3opus",
"resolved": 18.2,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240402_sweagent_claude3opus/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240402_sweagent_claude3opus/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": true,
"oss": true
},
+ {
+ "name": "Lingma Agent + Lingma SWE-GPT 7b (v0925)",
+ "folder": "20241002_lingma-agent_lingma-swe-gpt-7b",
+ "resolved": 18.2,
+ "date": "2024-10-02",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.modelscope.cn/models/yingwei/Lingma-SWE-GPT (https://www.modelscope.cn/models/yingwei/Lingma-SWE-GPT-v20240925)",
+ "verified": false,
+ "oss": true
+ },
+ {
+ "name": "Lingma Agent + Lingma SWE-GPT 7b (v0918)",
+ "folder": "20240918_lingma-agent_lingma-swe-gpt-7b",
+ "resolved": 10.2,
+ "date": "2024-09-18",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.modelscope.cn/models/yingwei/Lingma-SWE-GPT",
+ "verified": false,
+ "oss": true
+ },
{
"name": "RAG + Claude 3 Opus",
"folder": "20240402_rag_claude3opus",
"resolved": 7.0,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240402_rag_claude3opus/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/princeton-nlp/SWE-bench/tree/main/swebench/inference",
"verified": true,
"oss": true
@@ -332,8 +464,8 @@
"folder": "20231010_rag_claude2",
"resolved": 4.4,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20231010_rag_claude2/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -343,8 +475,8 @@
"folder": "20240402_rag_gpt4",
"resolved": 2.8,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20240402_rag_gpt4/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -354,8 +486,8 @@
"folder": "20231010_rag_swellama7b",
"resolved": 1.4,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20231010_rag_swellama7b/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -365,8 +497,8 @@
"folder": "20231010_rag_swellama13b",
"resolved": 1.2,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20231010_rag_swellama13b/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -376,8 +508,8 @@
"folder": "20231010_rag_gpt35",
"resolved": 0.4,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/verified/20231010_rag_gpt35/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -392,19 +524,30 @@
"folder": "20240702_codestory_aide_mixed",
"resolved": 43.0,
"date": "2024-07-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240702_codestory_aide_mixed/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://aide.dev/",
"verified": false,
"oss": false
},
+ {
+ "name": "Bytedance MarsCode Agent",
+ "folder": "20240912_marscode-agent-dev",
+ "resolved": 39.33,
+ "date": "2024-09-12",
+ "logs": true,
+ "trajs": true,
+ "site": "https://www.marscode.com/",
+ "verified": false,
+ "oss": false
+ },
{
"name": "Honeycomb",
"folder": "20240820_honeycomb",
"resolved": 38.33,
"date": "2024-08-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240820_honeycomb/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240820_honeycomb/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://honeycomb.sh/",
"verified": false,
"oss": false
@@ -414,8 +557,8 @@
"folder": "20240627_abanteai_mentatbot_gpt4o",
"resolved": 38.0,
"date": "2024-06-27",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240627_abanteai_mentatbot_gpt4o/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://mentat.ai/blog/mentatbot-sota-coding-agent",
"verified": false,
"oss": false
@@ -425,8 +568,8 @@
"folder": "20240811_gru",
"resolved": 35.67,
"date": "2024-08-11",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240811_gru/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240811_gru/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://gru.ai",
"verified": false,
"oss": false
@@ -436,8 +579,8 @@
"folder": "20240829_Isoform",
"resolved": 35.0,
"date": "2024-08-29",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240829_Isoform/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240829_Isoform/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://isoform.ai",
"verified": false,
"oss": false
@@ -447,8 +590,8 @@
"folder": "20240806_SuperCoder2.0",
"resolved": 34.0,
"date": "2024-08-06",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240806_SuperCoder2.0/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240806_SuperCoder2.0/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://superagi.com/supercoder/",
"verified": false,
"oss": false
@@ -458,8 +601,8 @@
"folder": "20240723_marscode-agent-dev",
"resolved": 34.0,
"date": "2024-07-23",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240723_marscode-agent-dev/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://www.marscode.com/",
"verified": false,
"oss": false
@@ -469,8 +612,8 @@
"folder": "20240622_Lingma_Agent",
"resolved": 33.0,
"date": "2024-06-22",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240622_Lingma_Agent/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240622_Lingma_Agent/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://arxiv.org/abs/2406.01422",
"verified": false,
"oss": false
@@ -480,8 +623,8 @@
"folder": "20240617_factory_code_droid",
"resolved": 31.33,
"date": "2024-06-17",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240617_factory_code_droid/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://www.factory.ai/",
"verified": false,
"oss": false
@@ -491,19 +634,30 @@
"folder": "20240621_autocoderover-v20240620",
"resolved": 30.67,
"date": "2024-06-21",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240621_autocoderover-v20240620/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240621_autocoderover-v20240620/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://autocoderover.dev/",
"verified": false,
"oss": true
},
+ {
+ "name": "AIGCode Infant-Coder(2024-08-30)",
+ "folder": "20240908_infant_gpt4o",
+ "resolved": 30.0,
+ "date": "2024-09-08",
+ "logs": true,
+ "trajs": true,
+ "site": "https://aigcode.net/",
+ "verified": false,
+ "oss": false
+ },
{
"name": "Amazon Q Developer Agent (v20240719-dev)",
"folder": "20240721_amazon-q-developer-agent-20240719-dev",
"resolved": 29.67,
"date": "2024-07-21",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240721_amazon-q-developer-agent-20240719-dev/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240721_amazon-q-developer-agent-20240719-dev/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://aws.amazon.com/q/developer/",
"verified": false,
"oss": false
@@ -513,8 +667,8 @@
"folder": "20240808_RepoGraph_gpt4o",
"resolved": 29.67,
"date": "2024-08-08",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240808_RepoGraph_gpt4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240808_RepoGraph_gpt4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/ozyyshr/RepoGraph",
"verified": false,
"oss": true
@@ -524,8 +678,8 @@
"folder": "20240604_CodeR",
"resolved": 28.33,
"date": "2024-06-04",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240604_CodeR/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/NL2Code/CodeR",
"verified": false,
"oss": false
@@ -535,8 +689,8 @@
"folder": "20240612_MASAI_gpt4o",
"resolved": 28.0,
"date": "2024-06-12",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240612_MASAI_gpt4o/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/masai-dev-agent/masai",
"verified": false,
"oss": false
@@ -546,8 +700,8 @@
"folder": "20240706_sima_gpt4o",
"resolved": 27.67,
"date": "2024-07-06",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240706_sima_gpt4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240706_sima_gpt4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240706_sima_gpt4o",
"verified": false,
"oss": false
@@ -557,8 +711,8 @@
"folder": "20240630_agentless_gpt4o",
"resolved": 27.33,
"date": "2024-06-30",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240630_agentless_gpt4o/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/OpenAutoCoder/Agentless",
"verified": false,
"oss": true
@@ -568,8 +722,8 @@
"folder": "20240623_moatless_claude35sonnet",
"resolved": 26.67,
"date": "2024-06-23",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240623_moatless_claude35sonnet/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240623_moatless_claude35sonnet/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/aorwall/moatless-tools",
"verified": true,
"oss": true
@@ -579,8 +733,8 @@
"folder": "20240725_opendevin_codeact_v1.8_claude35sonnet",
"resolved": 26.67,
"date": "2024-07-25",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240725_opendevin_codeact_v1.8_claude35sonnet/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240725_opendevin_codeact_v1.8_claude35sonnet/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://docs.all-hands.dev/",
"verified": true,
"oss": true
@@ -590,8 +744,8 @@
"folder": "20240612_IBM_Research_Agent101",
"resolved": 26.67,
"date": "2024-06-12",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240612_IBM_Research_Agent101/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240612_IBM_Research_Agent101",
"verified": false,
"oss": false
@@ -601,30 +755,52 @@
"folder": "20240523_aider",
"resolved": 26.33,
"date": "2024-05-23",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240523_aider/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/paul-gauthier/aider",
"verified": false,
"oss": true
},
+ {
+ "name": "HyperAgent",
+ "folder": "20240925_hyperagent_lite1",
+ "resolved": 25.33,
+ "date": "2024-09-25",
+ "logs": true,
+ "trajs": true,
+ "site": "https://arxiv.org/abs/2409.16299",
+ "verified": false,
+ "oss": false
+ },
{
"name": "Moatless Tools + GPT 4o (2024-05-13)",
"folder": "20240617_moatless_gpt4o",
"resolved": 24.67,
"date": "2024-06-17",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240617_moatless_gpt4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240617_moatless_gpt4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/aorwall/moatless-tools",
"verified": true,
"oss": true
},
+ {
+ "name": "IBM SWE-1.0 (with open LLMs)",
+ "folder": "20241016_IBM-SWE-1.0",
+ "resolved": 23.67,
+ "date": "2024-10-16",
+ "logs": true,
+ "trajs": true,
+ "site": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20241016_IBM-SWE-1.0",
+ "verified": false,
+ "oss": false
+ },
{
"name": "OpenCSG StarShip CodeGenAgent + GPT 4 (0613)",
"folder": "20240524_opencsg_starship_gpt4",
"resolved": 23.67,
"date": "2024-05-24",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240524_opencsg_starship_gpt4/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://opencsg.com/product?class=StarShip",
"verified": false,
"oss": false
@@ -634,8 +810,8 @@
"folder": "20240620_sweagent_claude3.5sonnet",
"resolved": 23.0,
"date": "2024-06-20",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240620_sweagent_claude3.5sonnet/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240620_sweagent_claude3.5sonnet/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": true,
"oss": true
@@ -645,8 +821,8 @@
"folder": "20240615_appmap-navie_gpt4o",
"resolved": 21.67,
"date": "2024-06-15",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240615_appmap-navie_gpt4o/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://appmap.io/navie",
"verified": true,
"oss": true
@@ -656,8 +832,8 @@
"folder": "20240828_autose_mixed",
"resolved": 21.67,
"date": "2024-08-28",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240828_autose_mixed/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240828_autose_mixed/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": false,
"oss": false
@@ -667,8 +843,8 @@
"folder": "20240509_amazon-q-developer-agent-20240430-dev",
"resolved": 20.33,
"date": "2024-05-09",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240509_amazon-q-developer-agent-20240430-dev/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://aws.amazon.com/q/developer/",
"verified": false,
"oss": false
@@ -678,8 +854,8 @@
"folder": "20240530_autocoderover-v20240408",
"resolved": 19.0,
"date": "2024-05-30",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240530_autocoderover-v20240408/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/nus-apr/auto-code-rover",
"verified": false,
"oss": true
@@ -689,8 +865,8 @@
"folder": "20240728_sweagent_gpt4o",
"resolved": 18.33,
"date": "2024-07-28",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240728_sweagent_gpt4o/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240728_sweagent_gpt4o/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/princeton-nlp/SWE-agent",
"verified": true,
"oss": true
@@ -700,8 +876,8 @@
"folder": "20240402_sweagent_gpt4",
"resolved": 18.0,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_gpt4/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_gpt4/trajs",
+ "logs": true,
+ "trajs": true,
"site": "https://github.com/princeton-nlp/SWE-agent",
"verified": true,
"oss": true
@@ -711,8 +887,8 @@
"folder": "20240402_sweagent_claude3opus",
"resolved": 11.67,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_claude3opus/logs",
- "trajs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_sweagent_claude3opus/trajs",
+ "logs": true,
+ "trajs": true,
"site": null,
"verified": true,
"oss": true
@@ -722,8 +898,8 @@
"folder": "20240402_rag_claude3opus",
"resolved": 4.33,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_rag_claude3opus/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": "https://github.com/princeton-nlp/SWE-bench/tree/main/swebench/inference",
"verified": true,
"oss": true
@@ -733,8 +909,8 @@
"folder": "20231010_rag_claude2",
"resolved": 3.0,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_claude2/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -744,8 +920,8 @@
"folder": "20240402_rag_gpt4",
"resolved": 2.67,
"date": "2024-04-02",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20240402_rag_gpt4/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -755,8 +931,8 @@
"folder": "20231010_rag_swellama7b",
"resolved": 1.33,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_swellama7b/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -766,8 +942,8 @@
"folder": "20231010_rag_swellama13b",
"resolved": 1.0,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_swellama13b/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
@@ -777,8 +953,8 @@
"folder": "20231010_rag_gpt35",
"resolved": 0.33,
"date": "2023-10-10",
- "logs": "https://github.com/swe-bench/experiments/tree/main/evaluation/lite/20231010_rag_gpt35/logs",
- "trajs": null,
+ "logs": true,
+ "trajs": false,
"site": null,
"verified": true,
"oss": true
diff --git a/template/template_index.html b/template/template_index.html
index 00a26f2..a440605 100644
--- a/template/template_index.html
+++ b/template/template_index.html
@@ -169,16 +169,12 @@ Leaderboard
| {{item.date}} |
- {% if item.logs %}
- 🔗
- {% else %} - {% endif %}
+ {% if item.logs %}✓{% else %} - {% endif %}
|
- {% if item.trajs %}
- 🔗
- {% else %} - {% endif %}
+ {% if item.trajs %}✓{% else %} - {% endif %}
|
diff --git a/viewer.html b/viewer.html
index 6daed9a..620d138 100644
--- a/viewer.html
+++ b/viewer.html
@@ -115,18 +115,30 @@ SWE-bench Analysis
|