Skip to content

Commit dc50af5

Browse files
committed
Export EvoScientist Part2 runs to Home
1 parent 0f1f824 commit dc50af5

650 files changed

Lines changed: 122944 additions & 6 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

data/leaderboard.json

Lines changed: 245 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,30 @@
702702
"model": "gpt-5.4",
703703
"model_display": "GPT-5.4"
704704
},
705+
"Astronomy_001": {
706+
"score": 21.2,
707+
"run_id": "Astronomy_001_20260403_194127",
708+
"duration_seconds": 420,
709+
"cost_usd": null,
710+
"model": "",
711+
"model_display": ""
712+
},
713+
"Astronomy_002": {
714+
"score": 23.9,
715+
"run_id": "Astronomy_002_20260404_132455",
716+
"duration_seconds": 410,
717+
"cost_usd": null,
718+
"model": "",
719+
"model_display": ""
720+
},
721+
"Astronomy_003": {
722+
"score": 47.3,
723+
"run_id": "Astronomy_003_20260404_133145",
724+
"duration_seconds": 208,
725+
"cost_usd": null,
726+
"model": "",
727+
"model_display": ""
728+
},
705729
"Chemistry_000": {
706730
"score": 7.5,
707731
"run_id": "Chemistry_000_20260330_095531",
@@ -710,6 +734,30 @@
710734
"model": "gpt-5.4",
711735
"model_display": "GPT-5.4"
712736
},
737+
"Chemistry_001": {
738+
"score": 7.3,
739+
"run_id": "Chemistry_001_20260404_133513",
740+
"duration_seconds": 366,
741+
"cost_usd": null,
742+
"model": "",
743+
"model_display": ""
744+
},
745+
"Chemistry_002": {
746+
"score": 1.4,
747+
"run_id": "Chemistry_002_20260404_134119",
748+
"duration_seconds": 341,
749+
"cost_usd": null,
750+
"model": "",
751+
"model_display": ""
752+
},
753+
"Chemistry_003": {
754+
"score": 1.5,
755+
"run_id": "Chemistry_003_20260404_134700",
756+
"duration_seconds": 727,
757+
"cost_usd": null,
758+
"model": "",
759+
"model_display": ""
760+
},
713761
"Earth_000": {
714762
"score": 12.8,
715763
"run_id": "Earth_000_20260330_104037",
@@ -718,6 +766,30 @@
718766
"model": "gpt-5.4",
719767
"model_display": "GPT-5.4"
720768
},
769+
"Earth_001": {
770+
"score": 39.97,
771+
"run_id": "Earth_001_20260404_135907",
772+
"duration_seconds": 229,
773+
"cost_usd": null,
774+
"model": "",
775+
"model_display": ""
776+
},
777+
"Earth_002": {
778+
"score": 15.1,
779+
"run_id": "Earth_002_20260404_140256",
780+
"duration_seconds": 488,
781+
"cost_usd": null,
782+
"model": "",
783+
"model_display": ""
784+
},
785+
"Earth_003": {
786+
"score": 0.0,
787+
"run_id": "Earth_003_20260404_141105",
788+
"duration_seconds": 497,
789+
"cost_usd": null,
790+
"model": "",
791+
"model_display": ""
792+
},
721793
"Energy_000": {
722794
"score": 12.4,
723795
"run_id": "Energy_000_20260330_104517",
@@ -726,6 +798,30 @@
726798
"model": "gpt-5.4",
727799
"model_display": "GPT-5.4"
728800
},
801+
"Energy_001": {
802+
"score": 20.9,
803+
"run_id": "Energy_001_20260404_141923",
804+
"duration_seconds": 404,
805+
"cost_usd": null,
806+
"model": "",
807+
"model_display": ""
808+
},
809+
"Energy_002": {
810+
"score": 36.4,
811+
"run_id": "Energy_002_20260404_142606",
812+
"duration_seconds": 304,
813+
"cost_usd": null,
814+
"model": "",
815+
"model_display": ""
816+
},
817+
"Energy_003": {
818+
"score": 26.3,
819+
"run_id": "Energy_003_20260404_143111",
820+
"duration_seconds": 336,
821+
"cost_usd": null,
822+
"model": "",
823+
"model_display": ""
824+
},
729825
"Information_000": {
730826
"score": 7.2,
731827
"run_id": "Information_000_20260330_111006",
@@ -734,6 +830,30 @@
734830
"model": "gpt-5.4",
735831
"model_display": "GPT-5.4"
736832
},
833+
"Information_001": {
834+
"score": 3.6,
835+
"run_id": "Information_001_20260404_143647",
836+
"duration_seconds": 547,
837+
"cost_usd": null,
838+
"model": "",
839+
"model_display": ""
840+
},
841+
"Information_002": {
842+
"score": 14.1,
843+
"run_id": "Information_002_20260404_144554",
844+
"duration_seconds": 311,
845+
"cost_usd": null,
846+
"model": "",
847+
"model_display": ""
848+
},
849+
"Information_003": {
850+
"score": 4.75,
851+
"run_id": "Information_003_20260404_145105",
852+
"duration_seconds": 1406,
853+
"cost_usd": null,
854+
"model": "",
855+
"model_display": ""
856+
},
737857
"Life_000": {
738858
"score": 7.05,
739859
"run_id": "Life_000_20260330_111707",
@@ -742,6 +862,30 @@
742862
"model": "gpt-5.4",
743863
"model_display": "GPT-5.4"
744864
},
865+
"Life_001": {
866+
"score": 13.6,
867+
"run_id": "Life_001_20260404_151432",
868+
"duration_seconds": 333,
869+
"cost_usd": null,
870+
"model": "",
871+
"model_display": ""
872+
},
873+
"Life_002": {
874+
"score": 9.1,
875+
"run_id": "Life_002_20260404_152005",
876+
"duration_seconds": 206,
877+
"cost_usd": null,
878+
"model": "",
879+
"model_display": ""
880+
},
881+
"Life_003": {
882+
"score": 36.0,
883+
"run_id": "Life_003_20260404_152331",
884+
"duration_seconds": 388,
885+
"cost_usd": null,
886+
"model": "",
887+
"model_display": ""
888+
},
745889
"Material_000": {
746890
"score": 17.5,
747891
"run_id": "Material_000_20260330_112849",
@@ -750,6 +894,30 @@
750894
"model": "gpt-5.4",
751895
"model_display": "GPT-5.4"
752896
},
897+
"Material_001": {
898+
"score": 6.3,
899+
"run_id": "Material_001_20260404_152959",
900+
"duration_seconds": 323,
901+
"cost_usd": null,
902+
"model": "",
903+
"model_display": ""
904+
},
905+
"Material_002": {
906+
"score": 10.05,
907+
"run_id": "Material_002_20260404_153523",
908+
"duration_seconds": 239,
909+
"cost_usd": null,
910+
"model": "",
911+
"model_display": ""
912+
},
913+
"Material_003": {
914+
"score": 19.95,
915+
"run_id": "Material_003_20260404_153922",
916+
"duration_seconds": 977,
917+
"cost_usd": null,
918+
"model": "",
919+
"model_display": ""
920+
},
753921
"Math_000": {
754922
"score": 8.0,
755923
"run_id": "Math_000_20260330_120257",
@@ -758,6 +926,30 @@
758926
"model": "gpt-5.4",
759927
"model_display": "GPT-5.4"
760928
},
929+
"Math_001": {
930+
"score": 35.2,
931+
"run_id": "Math_001_20260404_155539",
932+
"duration_seconds": 416,
933+
"cost_usd": null,
934+
"model": "",
935+
"model_display": ""
936+
},
937+
"Math_002": {
938+
"score": 14.1,
939+
"run_id": "Math_002_20260404_160235",
940+
"duration_seconds": 588,
941+
"cost_usd": null,
942+
"model": "",
943+
"model_display": ""
944+
},
945+
"Math_003": {
946+
"score": 0.0,
947+
"run_id": "Math_003_20260404_161228",
948+
"duration_seconds": 341,
949+
"cost_usd": null,
950+
"model": "",
951+
"model_display": ""
952+
},
761953
"Neuroscience_000": {
762954
"score": 13.6,
763955
"run_id": "Neuroscience_000_20260330_121229",
@@ -766,13 +958,61 @@
766958
"model": "gpt-5.4",
767959
"model_display": "GPT-5.4"
768960
},
961+
"Neuroscience_001": {
962+
"score": 0.75,
963+
"run_id": "Neuroscience_001_20260404_161809",
964+
"duration_seconds": 478,
965+
"cost_usd": null,
966+
"model": "",
967+
"model_display": ""
968+
},
969+
"Neuroscience_002": {
970+
"score": 1.0,
971+
"run_id": "Neuroscience_002_20260404_162607",
972+
"duration_seconds": 2282,
973+
"cost_usd": null,
974+
"model": "",
975+
"model_display": ""
976+
},
977+
"Neuroscience_003": {
978+
"score": 5.95,
979+
"run_id": "Neuroscience_003_20260404_170410",
980+
"duration_seconds": 625,
981+
"cost_usd": null,
982+
"model": "",
983+
"model_display": ""
984+
},
769985
"Physics_000": {
770986
"score": 19.2,
771987
"run_id": "Physics_000_20260330_122122",
772988
"duration_seconds": 390,
773989
"cost_usd": 0.975,
774990
"model": "gpt-5.4",
775991
"model_display": "GPT-5.4"
992+
},
993+
"Physics_001": {
994+
"score": 24.5,
995+
"run_id": "Physics_001_20260404_171435",
996+
"duration_seconds": 354,
997+
"cost_usd": null,
998+
"model": "",
999+
"model_display": ""
1000+
},
1001+
"Physics_002": {
1002+
"score": 24.85,
1003+
"run_id": "Physics_002_20260404_172029",
1004+
"duration_seconds": 794,
1005+
"cost_usd": null,
1006+
"model": "",
1007+
"model_display": ""
1008+
},
1009+
"Physics_003": {
1010+
"score": 36.8,
1011+
"run_id": "Physics_003_20260404_173350",
1012+
"duration_seconds": 321,
1013+
"cost_usd": null,
1014+
"model": "",
1015+
"model_display": ""
7761016
}
7771017
},
7781018
"Nanobot": {
@@ -1423,11 +1663,11 @@
14231663
"frontier": {
14241664
"Astronomy_000": 29.4,
14251665
"Astronomy_001": 36.0,
1426-
"Astronomy_002": 23.5,
1666+
"Astronomy_002": 23.9,
14271667
"Astronomy_003": 47.3,
14281668
"Chemistry_000": 16.85,
1429-
"Chemistry_001": 6.2,
1430-
"Chemistry_002": 1.0,
1669+
"Chemistry_001": 7.3,
1670+
"Chemistry_002": 1.4,
14311671
"Chemistry_003": 18.0,
14321672
"Earth_000": 25.3,
14331673
"Earth_001": 40.87,
@@ -1436,7 +1676,7 @@
14361676
"Energy_000": 16.0,
14371677
"Energy_001": 24.5,
14381678
"Energy_002": 38.6,
1439-
"Energy_003": 25.5,
1679+
"Energy_003": 26.3,
14401680
"Information_000": 48.0,
14411681
"Information_001": 7.0,
14421682
"Information_002": 36.8,
@@ -1451,7 +1691,7 @@
14511691
"Material_003": 28.8,
14521692
"Math_000": 26.65,
14531693
"Math_001": 44.1,
1454-
"Math_002": 10.2,
1694+
"Math_002": 14.1,
14551695
"Math_003": 29.6,
14561696
"Neuroscience_000": 14.0,
14571697
"Neuroscience_001": 4.95,

0 commit comments

Comments
 (0)