[
  {
    "title": "Accuracy",
    "header": [
      {
        "value": "Model/adapter",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "NarrativeQA - F1",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "F1",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - F1",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "F1",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - F1",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nF1: Average F1 score in terms of word overlap between the model output and correct reference.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "F1",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "OpenbookQA - EM",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "MMLU - EM",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nExact match: Fraction of instances that the predicted output matches a correct reference exactly.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MATH - Equivalent (CoT)",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nEquivalent (CoT): Fraction of model outputs that are mathematically equivalent to the correct reference when using chain-of-thought prompting.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "Equivalent (CoT)",
          "run_group": "MATH"
        }
      },
      {
        "value": "GSM8K - EM",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nExact match (final number): Fraction of instances that the predicted output matches a correct reference exactly, ignoring text preceding the specified indicator.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "LegalBench - EM",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "MedQA - EM",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nQuasi-exact match: Fraction of instances that the predicted output matches a correct reference up to light processing.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "EM",
          "run_group": "MedQA"
        }
      },
      {
        "value": "WMT 2014 - BLEU-4",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nBLEU-4: Average BLEU score [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) based on 4-gram overlap.",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {
          "metric": "BLEU-4",
          "run_group": "WMT 2014"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Phi-3 (14B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5587745587745587,
          "markdown": false
        },
        {
          "value": 0.7240664860594255,
          "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.7285957619003083,
          "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.2784356161051305,
          "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.916,
          "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.6750526315789473,
          "description": "min=0.48, mean=0.675, max=0.94, sum=3.375 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.6109688353262405,
          "description": "min=0.462, mean=0.611, max=0.7, sum=4.277 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.878,
          "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.5931701729996458,
          "description": "min=0.365, mean=0.593, max=0.811, sum=2.966 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.6958250497017893,
          "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.16999895026248463,
          "description": "min=0.086, mean=0.17, max=0.218, sum=0.85 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        }
      ],
      [
        {
          "value": "Phi-3 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5242165242165242,
          "markdown": false
        },
        {
          "value": 0.7539896438820357,
          "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.674750712045289,
          "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.32368708044862116,
          "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.912,
          "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.6592982456140352,
          "description": "min=0.44, mean=0.659, max=0.95, sum=3.296 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.7031213992046306,
          "description": "min=0.538, mean=0.703, max=0.933, sum=4.922 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "value": 0.583917247575927,
          "description": "min=0.395, mean=0.584, max=0.895, sum=2.92 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.6719681908548708,
          "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.154384722250288,
          "description": "min=0.043, mean=0.154, max=0.205, sum=0.772 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        }
      ],
      [
        {
          "value": "DBRX Instruct",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.32553280053280054,
          "markdown": false
        },
        {
          "value": 0.48838701972031817,
          "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.5497946968436366,
          "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.2838740590464037,
          "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.91,
          "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.6430526315789474,
          "description": "min=0.34, mean=0.643, max=0.93, sum=3.215 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.35840470785391226,
          "description": "min=0.015, mean=0.358, max=0.553, sum=2.509 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.671,
          "description": "min=0.671, mean=0.671, max=0.671, sum=0.671 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.4259173535239421,
          "description": "min=0.053, mean=0.426, max=0.755, sum=2.13 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.6938369781312127,
          "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.1311688767117412,
          "description": "min=0.035, mean=0.131, max=0.192, sum=0.656 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek LLM Chat (67B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5367132867132868,
          "markdown": false
        },
        {
          "value": 0.5810304555858785,
          "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.7325504963344249,
          "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.41210098854875066,
          "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.88,
          "description": "min=0.88, mean=0.88, max=0.88, sum=0.88 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.6405263157894737,
          "description": "min=0.44, mean=0.641, max=0.91, sum=3.203 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.6147909776428747,
          "description": "min=0.456, mean=0.615, max=0.748, sum=4.304 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.795,
          "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.6365270638644099,
          "description": "min=0.45, mean=0.637, max=0.821, sum=3.183 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.6282306163021869,
          "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.18631240946055874,
          "description": "min=0.11, mean=0.186, max=0.236, sum=0.932 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        }
      ],
      [
        {
          "value": "Falcon (40B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2399100899100899,
          "markdown": false
        },
        {
          "value": 0.6705103367078968,
          "description": "min=0.671, mean=0.671, max=0.671, sum=0.671 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.6764905578489675,
          "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.39238346766141513,
          "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.662,
          "description": "min=0.662, mean=0.662, max=0.662, sum=0.662 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.5069122807017543,
          "description": "min=0.31, mean=0.507, max=0.79, sum=2.535 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.12762334677879353,
          "description": "min=0.019, mean=0.128, max=0.228, sum=0.893 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.267,
          "description": "min=0.267, mean=0.267, max=0.267, sum=0.267 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.4418339946791853,
          "description": "min=0.204, mean=0.442, max=0.737, sum=2.209 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.4194831013916501,
          "description": "min=0.419, mean=0.419, max=0.419, sum=0.419 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.16187038401589685,
          "description": "min=0.017, mean=0.162, max=0.208, sum=0.809 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        }
      ],
      [
        {
          "value": "Falcon (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.07311022311022311,
          "markdown": false
        },
        {
          "value": 0.6210887417964561,
          "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.5804092414377209,
          "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.28519479870983677,
          "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.26,
          "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.28814035087719303,
          "description": "min=0.17, mean=0.288, max=0.39, sum=1.441 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0437872907273152,
          "description": "min=0, mean=0.044, max=0.105, sum=0.307 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.055,
          "description": "min=0.055, mean=0.055, max=0.055, sum=0.055 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.346112465866886,
          "description": "min=0.12, mean=0.346, max=0.558, sum=1.731 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.2544731610337972,
          "description": "min=0.254, mean=0.254, max=0.254, sum=0.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.09410095284050815,
          "description": "min=0.0, mean=0.094, max=0.186, sum=0.471 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        }
      ],
      [
        {
          "value": "Gemma 2 Instruct (27B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7265401265401266,
          "markdown": false
        },
        {
          "value": 0.7896535025615958,
          "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.7313048410150741,
          "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.3529084000159452,
          "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.918,
          "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.6640701754385965,
          "description": "min=0.44, mean=0.664, max=0.93, sum=3.32 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.7455609307506493,
          "description": "min=0.513, mean=0.746, max=0.93, sum=5.219 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.812,
          "description": "min=0.812, mean=0.812, max=0.812, sum=0.812 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.699800073753867,
          "description": "min=0.439, mean=0.7, max=0.979, sum=3.499 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.68389662027833,
          "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.2144742673803493,
          "description": "min=0.167, mean=0.214, max=0.241, sum=1.072 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        }
      ],
      [
        {
          "value": "Gemma 2 Instruct (9B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6167415917415917,
          "markdown": false
        },
        {
          "value": 0.7676649784011479,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.737982953197707,
          "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.3278367961676789,
          "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.91,
          "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.6450526315789473,
          "description": "min=0.42, mean=0.645, max=0.91, sum=3.225 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.7244174090563319,
          "description": "min=0.635, mean=0.724, max=0.907, sum=5.071 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.762,
          "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 0.6386723496167432,
          "description": "min=0.395, mean=0.639, max=0.937, sum=3.193 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.6302186878727635,
          "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.20060633485163287,
          "description": "min=0.155, mean=0.201, max=0.228, sum=1.003 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        }
      ],
      [
        {
          "value": "Gemma (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.37597402597402596,
          "markdown": false
        },
        {
          "value": 0.7516340764937092,
          "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.6650827948992947,
          "description": "min=0.665, mean=0.665, max=0.665, sum=0.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.3356376334853556,
          "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.808,
          "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.5707368421052632,
          "description": "min=0.28, mean=0.571, max=0.87, sum=2.854 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.4998062389127261,
          "description": "min=0.3, mean=0.5, max=0.711, sum=3.499 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.559,
          "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.5808225733660738,
          "description": "min=0.379, mean=0.581, max=0.811, sum=2.904 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.5129224652087475,
          "description": "min=0.513, mean=0.513, max=0.513, sum=0.513 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.18740989970028615,
          "description": "min=0.137, mean=0.187, max=0.211, sum=0.937 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (13B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2597652347652348,
          "markdown": false
        },
        {
          "value": 0.7407788701111923,
          "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.639984600932008,
          "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.3706060978348202,
          "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.634,
          "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.5054035087719299,
          "description": "min=0.28, mean=0.505, max=0.84, sum=2.527 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.1021091538227401,
          "description": "min=0, mean=0.102, max=0.193, sum=0.715 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.266,
          "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.5910030736631379,
          "description": "min=0.338, mean=0.591, max=0.779, sum=2.955 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.39165009940357853,
          "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.1672658654262144,
          "description": "min=0.074, mean=0.167, max=0.209, sum=0.836 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5215034965034965,
          "markdown": false
        },
        {
          "value": 0.7631713198687995,
          "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.6739523507154374,
          "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.4595602930170799,
          "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.838,
          "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.5804561403508772,
          "description": "min=0.31, mean=0.58, max=0.92, sum=2.902 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.32279453852281265,
          "description": "min=0.205, mean=0.323, max=0.489, sum=2.26 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.567,
          "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.6726994442119312,
          "description": "min=0.444, mean=0.673, max=0.937, sum=3.363 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.6182902584493042,
          "description": "min=0.618, mean=0.618, max=0.618, sum=0.618 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.19587598395260689,
          "description": "min=0.12, mean=0.196, max=0.233, sum=0.979 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.16997169497169498,
          "markdown": false
        },
        {
          "value": 0.685621004977857,
          "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.61174518010822,
          "description": "min=0.612, mean=0.612, max=0.612, sum=0.612 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.33263163753268066,
          "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.544,
          "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.42491228070175435,
          "description": "min=0.27, mean=0.425, max=0.63, sum=2.125 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.09707261096489982,
          "description": "min=0.019, mean=0.097, max=0.198, sum=0.68 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.154,
          "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.5015588886580016,
          "description": "min=0.245, mean=0.502, max=0.747, sum=2.508 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.39165009940357853,
          "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.14394034057731314,
          "description": "min=0.046, mean=0.144, max=0.189, sum=0.72 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3 (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8265234765234766,
          "markdown": false
        },
        {
          "value": 0.7982012117768263,
          "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.7432651718882813,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.47528293189781273,
          "description": "min=0.475, mean=0.475, max=0.475, sum=0.475 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.934,
          "description": "min=0.934, mean=0.934, max=0.934, sum=0.934 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.6945964912280702,
          "description": "min=0.43, mean=0.695, max=0.94, sum=3.473 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.6629409689997205,
          "description": "min=0.433, mean=0.663, max=0.822, sum=4.641 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.805,
          "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.7329205565490214,
          "description": "min=0.466, mean=0.733, max=0.958, sum=3.665 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.7773359840954275,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.2246351578099651,
          "description": "min=0.183, mean=0.225, max=0.259, sum=1.123 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3 (8B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.42783050283050283,
          "markdown": false
        },
        {
          "value": 0.7539233930711181,
          "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.6806203798949644,
          "description": "min=0.681, mean=0.681, max=0.681, sum=0.681 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.37803680876088575,
          "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.766,
          "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.6015087719298245,
          "description": "min=0.33, mean=0.602, max=0.88, sum=3.008 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.39103657573669814,
          "description": "min=0.233, mean=0.391, max=0.496, sum=2.737 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.499,
          "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.637026652071986,
          "description": "min=0.417, mean=0.637, max=0.874, sum=3.185 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.5805168986083499,
          "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.18308563052673504,
          "description": "min=0.133, mean=0.183, max=0.212, sum=0.915 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (405B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8890942390942391,
          "markdown": false
        },
        {
          "value": 0.7493835397091708,
          "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.7558559882582095,
          "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.45604816952502747,
          "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.94,
          "description": "min=0.94, mean=0.94, max=0.94, sum=0.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.7591228070175438,
          "description": "min=0.6, mean=0.759, max=0.94, sum=3.796 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.827009209567349,
          "description": "min=0.635, mean=0.827, max=0.97, sum=5.789 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.949,
          "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.7071305010287493,
          "description": "min=0.433, mean=0.707, max=0.979, sum=3.536 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.805168986083499,
          "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.23813718625719624,
          "description": "min=0.2, mean=0.238, max=0.284, sum=1.191 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.848043623043623,
          "markdown": false
        },
        {
          "value": 0.7723707219827545,
          "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.7384895831522981,
          "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.4517358170841441,
          "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.938,
          "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.7090877192982457,
          "description": "min=0.55, mean=0.709, max=0.93, sum=3.545 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.7832717352913191,
          "description": "min=0.579, mean=0.783, max=0.97, sum=5.483 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.938,
          "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.6866785900133753,
          "description": "min=0.439, mean=0.687, max=1, sum=3.433 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.7693836978131213,
          "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.2228443609742569,
          "description": "min=0.183, mean=0.223, max=0.265, sum=1.114 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (8B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.33544788544788545,
          "markdown": false
        },
        {
          "value": 0.7558634540169302,
          "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.6771139490390586,
          "description": "min=0.677, mean=0.677, max=0.677, sum=0.677 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.2089944707027018,
          "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.74,
          "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.5001754385964912,
          "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.7028671111290451,
          "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.798,
          "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.34203920837516133,
          "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.24453280318091453,
          "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.18133059362692966,
          "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.2 Vision Instruct Turbo (11B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.35864135864135865,
          "markdown": false
        },
        {
          "value": 0.7556933359362039,
          "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.670634329606622,
          "description": "min=0.671, mean=0.671, max=0.671, sum=0.671 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.23418121884173548,
          "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.724,
          "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.5109473684210526,
          "description": "min=0.28, mean=0.511, max=0.78, sum=2.555 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.7393694681944376,
          "description": "min=0.579, mean=0.739, max=0.884, sum=5.176 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.823,
          "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.4349549264363712,
          "description": "min=0.018, mean=0.435, max=0.905, sum=2.175 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.27037773359840955,
          "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.17928882640651603,
          "description": "min=0.13, mean=0.179, max=0.217, sum=0.896 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.2 Vision Instruct Turbo (90B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8595654345654345,
          "markdown": false
        },
        {
          "value": 0.7768594454083262,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.7386694111679715,
          "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.45682333540724823,
          "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.942,
          "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.7028421052631579,
          "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.7914277345121897,
          "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.936,
          "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.6796099392115946,
          "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.7693836978131213,
          "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.22413544739315405,
          "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "LLaMA (65B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.37845487845487846,
          "markdown": false
        },
        {
          "value": 0.7551554577325463,
          "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 0.6721751757066182,
          "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.43256542520883445,
          "description": "min=0.433, mean=0.433, max=0.433, sum=0.433 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.754,
          "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.5837192982456141,
          "description": "min=0.34, mean=0.584, max=0.89, sum=2.919 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.25736959672088194,
          "description": "min=0.096, mean=0.257, max=0.474, sum=1.802 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.489,
          "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 0.4802675608835245,
          "description": "min=0.018, mean=0.48, max=0.863, sum=2.401 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.5069582504970179,
          "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 0.1890398965463828,
          "description": "min=0.102, mean=0.189, max=0.239, sum=0.945 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        }
      ],
      [
        {
          "value": "Mistral Instruct v0.3 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2208125208125208,
          "markdown": false
        },
        {
          "value": 0.716327555426559,
          "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.6796142755781713,
          "description": "min=0.68, mean=0.68, max=0.68, sum=0.68 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.2532549871958964,
          "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.79,
          "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.5102105263157894,
          "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.2885163116497266,
          "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.538,
          "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 0.33102715605674365,
          "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.5168986083499006,
          "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.14246230691622103,
          "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        }
      ],
      [
        {
          "value": "Mistral v0.1 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3251998001998002,
          "markdown": false
        },
        {
          "value": 0.7164233362865015,
          "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.6869664212545938,
          "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.36699189732791715,
          "description": "min=0.367, mean=0.367, max=0.367, sum=0.367 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.776,
          "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.5835087719298246,
          "description": "min=0.31, mean=0.584, max=0.85, sum=2.918 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.2973668020179648,
          "description": "min=0.067, mean=0.297, max=0.43, sum=2.082 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.377,
          "description": "min=0.377, mean=0.377, max=0.377, sum=0.377 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.5802532274633645,
          "description": "min=0.433, mean=0.58, max=0.789, sum=2.901 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.5248508946322068,
          "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.1604456237859723,
          "description": "min=0.056, mean=0.16, max=0.201, sum=0.802 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        }
      ],
      [
        {
          "value": "Mixtral (8x22B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7521478521478522,
          "markdown": false
        },
        {
          "value": 0.7787120510393413,
          "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.7263120461221779,
          "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.4775699105464498,
          "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.882,
          "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.7013333333333331,
          "description": "min=0.48, mean=0.701, max=0.95, sum=3.507 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.6555091515311833,
          "description": "min=0.5, mean=0.656, max=0.822, sum=4.589 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.8,
          "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.7078762895951439,
          "description": "min=0.441, mean=0.708, max=0.968, sum=3.539 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.7037773359840954,
          "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.20899039631747862,
          "description": "min=0.133, mean=0.209, max=0.243, sum=1.045 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        }
      ],
      [
        {
          "value": "Mixtral (8x7B 32K seqlen)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5626456876456877,
          "markdown": false
        },
        {
          "value": 0.7666040431581972,
          "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.6991936086134782,
          "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.42725306310658473,
          "description": "min=0.427, mean=0.427, max=0.427, sum=0.427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.868,
          "description": "min=0.868, mean=0.868, max=0.868, sum=0.868 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.6490526315789473,
          "description": "min=0.38, mean=0.649, max=0.93, sum=3.245 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.49413645423437347,
          "description": "min=0.289, mean=0.494, max=0.696, sum=3.459 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.622,
          "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.630020881709919,
          "description": "min=0.428, mean=0.63, max=0.853, sum=3.15 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.6520874751491054,
          "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.18982421560363932,
          "description": "min=0.099, mean=0.19, max=0.23, sum=0.949 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        }
      ],
      [
        {
          "value": "OLMo (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.06027306027306027,
          "markdown": false
        },
        {
          "value": 0.597068912076024,
          "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.6034502290883008,
          "description": "min=0.603, mean=0.603, max=0.603, sum=0.603 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.25857870485341333,
          "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.222,
          "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.30491228070175436,
          "description": "min=0.26, mean=0.305, max=0.38, sum=1.525 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.029284451438674204,
          "description": "min=0, mean=0.029, max=0.088, sum=0.205 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.044,
          "description": "min=0.044, mean=0.044, max=0.044, sum=0.044 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.3408965660550754,
          "description": "min=0.158, mean=0.341, max=0.6, sum=1.704 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.2286282306163022,
          "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.09737319820441259,
          "description": "min=0.009, mean=0.097, max=0.157, sum=0.487 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        }
      ],
      [
        {
          "value": "Phi-2",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.19139194139194138,
          "markdown": false
        },
        {
          "value": 0.7026273770474518,
          "description": "min=0.703, mean=0.703, max=0.703, sum=0.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.679955399008943,
          "description": "min=0.68, mean=0.68, max=0.68, sum=0.68 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.1551754873692229,
          "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.798,
          "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.5184210526315789,
          "description": "min=0.31, mean=0.518, max=0.78, sum=2.592 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.2551509214299912,
          "description": "min=0.033, mean=0.255, max=0.465, sum=1.786 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.581,
          "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.3344233190996174,
          "description": "min=0.137, mean=0.334, max=0.537, sum=1.672 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.4095427435387674,
          "description": "min=0.41, mean=0.41, max=0.41, sum=0.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.037894950541618386,
          "description": "min=0.0, mean=0.038, max=0.113, sum=0.189 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 Chat (110B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5983516483516483,
          "markdown": false
        },
        {
          "value": 0.7207089322288743,
          "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.739454568054656,
          "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.3498619896871144,
          "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.922,
          "description": "min=0.922, mean=0.922, max=0.922, sum=0.922 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.7040701754385965,
          "description": "min=0.57, mean=0.704, max=0.87, sum=3.52 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.5676674542647614,
          "description": "min=0.211, mean=0.568, max=0.769, sum=3.974 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.815,
          "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 0.6241109416203021,
          "description": "min=0.387, mean=0.624, max=0.958, sum=3.121 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.6401590457256461,
          "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.19237771714027324,
          "description": "min=0.133, mean=0.192, max=0.232, sum=0.962 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (14B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.473018648018648,
          "markdown": false
        },
        {
          "value": 0.7107698447689265,
          "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.7721353322012677,
          "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.30035971944775497,
          "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.862,
          "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.626280701754386,
          "description": "min=0.4, mean=0.626, max=0.87, sum=3.131 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.6856483785492354,
          "description": "min=0.6, mean=0.686, max=0.8, sum=4.8 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.693,
          "description": "min=0.693, mean=0.693, max=0.693, sum=0.693 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.593260765576846,
          "description": "min=0.358, mean=0.593, max=0.853, sum=2.966 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.5149105367793241,
          "description": "min=0.515, mean=0.515, max=0.515, sum=0.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.17796029908839783,
          "description": "min=0.101, mean=0.178, max=0.23, sum=0.89 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (32B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5956210456210456,
          "markdown": false
        },
        {
          "value": 0.5889067819526655,
          "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.777128811592766,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.35338053596096847,
          "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.932,
          "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.628280701754386,
          "description": "min=0.4, mean=0.628, max=0.91, sum=3.141 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.7331117497580165,
          "description": "min=0.5, mean=0.733, max=0.859, sum=5.132 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.773,
          "description": "min=0.773, mean=0.773, max=0.773, sum=0.773 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.6358504973995908,
          "description": "min=0.417, mean=0.636, max=0.926, sum=3.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.6560636182902585,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.19333209393951906,
          "description": "min=0.129, mean=0.193, max=0.242, sum=0.967 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6585414585414585,
          "markdown": false
        },
        {
          "value": 0.6014050903115206,
          "description": "min=0.601, mean=0.601, max=0.601, sum=0.601 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.7584906907350352,
          "description": "min=0.758, mean=0.758, max=0.758, sum=0.758 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.4171243347653772,
          "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.93,
          "description": "min=0.93, mean=0.93, max=0.93, sum=0.93 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.6467719298245613,
          "description": "min=0.44, mean=0.647, max=0.94, sum=3.234 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.6834005357261171,
          "description": "min=0.6, mean=0.683, max=0.763, sum=4.784 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.799,
          "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.693858307114614,
          "description": "min=0.425, mean=0.694, max=0.958, sum=3.469 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.6699801192842942,
          "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.20121654214700313,
          "description": "min=0.14, mean=0.201, max=0.255, sum=1.006 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3068098568098568,
          "markdown": false
        },
        {
          "value": 0.44816728370930614,
          "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.749199421021575,
          "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.26957631562717405,
          "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.806,
          "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.5694736842105262,
          "description": "min=0.39, mean=0.569, max=0.84, sum=2.847 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.5611864086772288,
          "description": "min=0.462, mean=0.561, max=0.726, sum=3.928 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.6,
          "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.5228154900681052,
          "description": "min=0.253, mean=0.523, max=0.716, sum=2.614 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.47912524850894633,
          "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.15348235064738502,
          "description": "min=0.082, mean=0.153, max=0.19, sum=0.767 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        }
      ],
      [
        {
          "value": "Qwen2 Instruct (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8152847152847152,
          "markdown": false
        },
        {
          "value": 0.7271789738821858,
          "description": "min=0.727, mean=0.727, max=0.727, sum=0.727 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.7758053578298147,
          "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.38969418493921437,
          "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.954,
          "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.7693684210526316,
          "description": "min=0.65, mean=0.769, max=0.94, sum=3.847 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.790383457152123,
          "description": "min=0.605, mean=0.79, max=0.93, sum=5.533 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.92,
          "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 0.7117073030961991,
          "description": "min=0.411, mean=0.712, max=0.947, sum=3.559 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.7455268389662028,
          "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.20663596807636653,
          "description": "min=0.156, mean=0.207, max=0.255, sum=1.033 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        }
      ],
      [
        {
          "value": "Qwen2.5 Instruct Turbo (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7805860805860806,
          "markdown": false
        },
        {
          "value": 0.7447068231799974,
          "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.6759967901974433,
          "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.35935588577919453,
          "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.962,
          "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.7696140350877194,
          "description": "min=0.62, mean=0.77, max=0.96, sum=3.848 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.8839012159697595,
          "description": "min=0.763, mean=0.884, max=0.97, sum=6.187 (7)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.9,
          "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.7400811216441259,
          "description": "min=0.46, mean=0.74, max=0.979, sum=3.7 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.7534791252485089,
          "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.2066158935644188,
          "description": "min=0.153, mean=0.207, max=0.257, sum=1.033 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Qwen2.5 Instruct Turbo (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5388028638028638,
          "markdown": false
        },
        {
          "value": 0.7423703094487041,
          "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.7249336636507822,
          "description": "min=0.725, mean=0.725, max=0.725, sum=0.725 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.2050909593667918,
          "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.862,
          "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.6580701754385965,
          "description": "min=0.49, mean=0.658, max=0.86, sum=3.29 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.8351277966333045,
          "description": "min=0.684, mean=0.835, max=0.963, sum=5.846 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.83,
          "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.6322681591093342,
          "description": "min=0.414, mean=0.632, max=0.916, sum=3.161 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.6003976143141153,
          "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.15536989248644223,
          "description": "min=0.085, mean=0.155, max=0.204, sum=0.777 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Arctic Instruct",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.37957875457875456,
          "markdown": false
        },
        {
          "value": 0.6535175183610216,
          "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.5864529300238688,
          "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.3897564598181023,
          "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.828,
          "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.5752280701754386,
          "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.5193910785710051,
          "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.768,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 0.5879736265537334,
          "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.5805168986083499,
          "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.17207944013368096,
          "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        }
      ],
      [
        {
          "value": "Yi (34B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6171495171495172,
          "markdown": false
        },
        {
          "value": 0.7821540048974333,
          "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.7751231950468775,
          "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.4434678831574275,
          "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.92,
          "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.6495438596491228,
          "description": "min=0.4, mean=0.65, max=0.91, sum=3.248 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.37478395850488877,
          "description": "min=0.167, mean=0.375, max=0.563, sum=2.623 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.648,
          "description": "min=0.648, mean=0.648, max=0.648, sum=0.648 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.6178875317917214,
          "description": "min=0.311, mean=0.618, max=0.8, sum=3.089 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.6560636182902585,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.1715816474552153,
          "description": "min=0.1, mean=0.172, max=0.218, sum=0.858 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        }
      ],
      [
        {
          "value": "Yi (6B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.28095238095238095,
          "markdown": false
        },
        {
          "value": 0.7017925106130677,
          "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.74781385828905,
          "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.30995982967860286,
          "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.8,
          "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.5301754385964912,
          "description": "min=0.3, mean=0.53, max=0.87, sum=2.651 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.12585197340399298,
          "description": "min=0.058, mean=0.126, max=0.2, sum=0.881 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.375,
          "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.5188185110499097,
          "description": "min=0.284, mean=0.519, max=0.779, sum=2.594 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.4970178926441352,
          "description": "min=0.497, mean=0.497, max=0.497, sum=0.497 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.1168525423688862,
          "description": "min=0.055, mean=0.117, max=0.182, sum=0.584 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        }
      ],
      [
        {
          "value": "Jurassic-2 Grande (17B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.19242424242424241,
          "markdown": false
        },
        {
          "value": 0.7444818075617076,
          "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.6266441944273605,
          "description": "min=0.627, mean=0.627, max=0.627, sum=0.627 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.34996616858056556,
          "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.614,
          "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.4709122807017544,
          "description": "min=0.25, mean=0.471, max=0.77, sum=2.355 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.063558867353237,
          "description": "min=0, mean=0.064, max=0.158, sum=0.445 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.159,
          "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.46762841806735606,
          "description": "min=0.199, mean=0.468, max=0.842, sum=2.338 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.38966202783300197,
          "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.10181218471056237,
          "description": "min=0.021, mean=0.102, max=0.149, sum=0.509 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        }
      ],
      [
        {
          "value": "Jurassic-2 Jumbo (178B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.23987678987678987,
          "markdown": false
        },
        {
          "value": 0.7282089633491154,
          "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.6501271721229362,
          "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.38533436458666126,
          "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.688,
          "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.48266666666666663,
          "description": "min=0.25, mean=0.483, max=0.83, sum=2.413 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.10289874457561117,
          "description": "min=0.033, mean=0.103, max=0.193, sum=0.72 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.239,
          "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.533179889193595,
          "description": "min=0.324, mean=0.533, max=0.821, sum=2.666 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.43141153081510936,
          "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.11441985640983483,
          "description": "min=0.044, mean=0.114, max=0.148, sum=0.572 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        }
      ],
      [
        {
          "value": "Jamba Instruct",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3216699966699967,
          "markdown": false
        },
        {
          "value": 0.6575850264683326,
          "description": "min=0.658, mean=0.658, max=0.658, sum=0.658 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.6360519186812247,
          "description": "min=0.636, mean=0.636, max=0.636, sum=0.636 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.38361128423575375,
          "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.796,
          "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.5817192982456141,
          "description": "min=0.36, mean=0.582, max=0.91, sum=2.909 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.3804915787779925,
          "description": "min=0.237, mean=0.38, max=0.607, sum=2.663 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.67,
          "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 0.5399763566175071,
          "description": "min=0.304, mean=0.54, max=0.874, sum=2.7 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.5188866799204771,
          "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.16401847474382014,
          "description": "min=0.099, mean=0.164, max=0.205, sum=0.656 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        }
      ],
      [
        {
          "value": "Jamba 1.5 Mini",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.45825840825840825,
          "markdown": false
        },
        {
          "value": 0.7462485808592105,
          "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.7102378053091456,
          "description": "min=0.71, mean=0.71, max=0.71, sum=0.71 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.38787534071774205,
          "description": "min=0.388, mean=0.388, max=0.388, sum=0.388 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.89,
          "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.5822456140350878,
          "description": "min=0.33, mean=0.582, max=0.9, sum=2.911 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.31810720898848194,
          "description": "min=0.233, mean=0.318, max=0.386, sum=2.227 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.691,
          "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 0.5027929442133945,
          "description": "min=0.365, mean=0.503, max=0.842, sum=2.514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.6322067594433399,
          "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.17897759445629507,
          "description": "min=0.116, mean=0.179, max=0.21, sum=0.895 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        }
      ],
      [
        {
          "value": "Jamba 1.5 Large",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6856809856809857,
          "markdown": false
        },
        {
          "value": 0.6635952388918296,
          "description": "min=0.664, mean=0.664, max=0.664, sum=0.664 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.7179355401122725,
          "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.3936910411156207,
          "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.948,
          "description": "min=0.948, mean=0.948, max=0.948, sum=0.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.6828070175438596,
          "description": "min=0.53, mean=0.683, max=0.92, sum=3.414 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.6917365716019327,
          "description": "min=0.481, mean=0.692, max=0.889, sum=4.842 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.846,
          "description": "min=0.846, mean=0.846, max=0.846, sum=0.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 0.6750865115298952,
          "description": "min=0.409, mean=0.675, max=0.989, sum=3.375 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.6978131212723658,
          "description": "min=0.698, mean=0.698, max=0.698, sum=0.698 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.20294190295008385,
          "description": "min=0.141, mean=0.203, max=0.246, sum=1.015 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        }
      ],
      [
        {
          "value": "Luminous Base (13B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.047435897435897434,
          "markdown": false
        },
        {
          "value": 0.6333079653434086,
          "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.5769072833940746,
          "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.1972226511424054,
          "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.286,
          "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.2433684210526316,
          "description": "min=0.22, mean=0.243, max=0.29, sum=1.217 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.026258799453413896,
          "description": "min=0, mean=0.026, max=0.067, sum=0.184 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.028,
          "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.33176688919652186,
          "description": "min=0.165, mean=0.332, max=0.601, sum=1.659 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.26043737574552683,
          "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.06610696472033581,
          "description": "min=0.0, mean=0.066, max=0.171, sum=0.331 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        }
      ],
      [
        {
          "value": "Luminous Extended (30B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.09043456543456543,
          "markdown": false
        },
        {
          "value": 0.6839493639230109,
          "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.6108345391959515,
          "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.2529234321406317,
          "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.272,
          "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.24838596491228068,
          "description": "min=0.2, mean=0.248, max=0.31, sum=1.242 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.039711233958479975,
          "description": "min=0, mean=0.04, max=0.088, sum=0.278 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.075,
          "description": "min=0.075, mean=0.075, max=0.075, sum=0.075 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.42137461988954483,
          "description": "min=0.204, mean=0.421, max=0.632, sum=2.107 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.27634194831013914,
          "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.08290194756313038,
          "description": "min=0.0, mean=0.083, max=0.194, sum=0.415 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        }
      ],
      [
        {
          "value": "Luminous Supreme (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.16352813852813852,
          "markdown": false
        },
        {
          "value": 0.7432289200129168,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.6560865693464876,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.2992475548820738,
          "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.284,
          "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.31642105263157894,
          "description": "min=0.18, mean=0.316, max=0.5, sum=1.582 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0782924315360056,
          "description": "min=0.038, mean=0.078, max=0.158, sum=0.548 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.137,
          "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.45204109729364284,
          "description": "min=0.221, mean=0.452, max=0.768, sum=2.26 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.27634194831013914,
          "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.10240517647293663,
          "description": "min=0.0, mean=0.102, max=0.193, sum=0.512 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        }
      ],
      [
        {
          "value": "Claude v1.3",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.571936396936397,
          "markdown": false
        },
        {
          "value": 0.723150787350498,
          "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.6992205134559425,
          "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.40885082169503945,
          "description": "min=0.409, mean=0.409, max=0.409, sum=0.409 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.908,
          "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.6310526315789474,
          "description": "min=0.35, mean=0.631, max=0.93, sum=3.155 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.5404430376278602,
          "description": "min=0.368, mean=0.54, max=0.826, sum=3.783 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.784,
          "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.629311936712158,
          "description": "min=0.417, mean=0.629, max=0.916, sum=3.147 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.6182902584493042,
          "description": "min=0.618, mean=0.618, max=0.618, sum=0.618 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.21852862176698085,
          "description": "min=0.152, mean=0.219, max=0.28, sum=1.093 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        }
      ],
      [
        {
          "value": "Claude Instant 1.2",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4455211455211455,
          "markdown": false
        },
        {
          "value": 0.61587810486012,
          "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.7309513133438542,
          "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.34322234970064514,
          "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.844,
          "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.6308070175438597,
          "description": "min=0.37, mean=0.631, max=0.9, sum=3.154 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.49868392408172085,
          "description": "min=0.365, mean=0.499, max=0.704, sum=3.491 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.721,
          "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.5862483550253602,
          "description": "min=0.341, mean=0.586, max=0.937, sum=2.931 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.558648111332008,
          "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.19427315928341765,
          "description": "min=0.138, mean=0.194, max=0.24, sum=0.971 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        }
      ],
      [
        {
          "value": "Claude 2.0",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5375624375624376,
          "markdown": false
        },
        {
          "value": 0.7176573220461364,
          "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.67019677256217,
          "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.4278073763812108,
          "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.862,
          "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.639298245614035,
          "description": "min=0.38, mean=0.639, max=0.9, sum=3.196 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.6027165409050355,
          "description": "min=0.491, mean=0.603, max=0.8, sum=4.219 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.583,
          "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.6431955817921604,
          "description": "min=0.387, mean=0.643, max=0.947, sum=3.216 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.6520874751491054,
          "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.21900739878736455,
          "description": "min=0.159, mean=0.219, max=0.268, sum=1.095 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        }
      ],
      [
        {
          "value": "Claude 2.1",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.48374958374958377,
          "markdown": false
        },
        {
          "value": 0.6770119068282102,
          "description": "min=0.677, mean=0.677, max=0.677, sum=0.677 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.6107197144743313,
          "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.37511663259487643,
          "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.872,
          "description": "min=0.872, mean=0.872, max=0.872, sum=0.872 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.6432982456140351,
          "description": "min=0.4, mean=0.643, max=0.92, sum=3.216 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.6321468050966216,
          "description": "min=0.5, mean=0.632, max=0.852, sum=4.425 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.604,
          "description": "min=0.604, mean=0.604, max=0.604, sum=0.604 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.6428163107262121,
          "description": "min=0.406, mean=0.643, max=0.874, sum=3.214 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.6441351888667992,
          "description": "min=0.644, mean=0.644, max=0.644, sum=0.644 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.20428512286077702,
          "description": "min=0.148, mean=0.204, max=0.233, sum=1.021 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Haiku (20240307)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2942057942057942,
          "markdown": false
        },
        {
          "value": 0.24405544274356078,
          "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.2515438813518554,
          "description": "min=0.252, mean=0.252, max=0.252, sum=0.252 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.1436799865373074,
          "description": "min=0.144, mean=0.144, max=0.144, sum=0.144 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.838,
          "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.6623157894736842,
          "description": "min=0.42, mean=0.662, max=0.95, sum=3.312 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.13086769298519604,
          "description": "min=0, mean=0.131, max=0.504, sum=0.916 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.699,
          "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.4601504450109314,
          "description": "min=0.034, mean=0.46, max=0.779, sum=2.301 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.7017892644135189,
          "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.147925817269585,
          "description": "min=0.018, mean=0.148, max=0.208, sum=0.74 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Sonnet (20240229)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.40884115884115885,
          "markdown": false
        },
        {
          "value": 0.11135505083090322,
          "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.07170782292313307,
          "description": "min=0.072, mean=0.072, max=0.072, sum=0.072 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.02834914415532957,
          "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.918,
          "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.6520701754385965,
          "description": "min=0.39, mean=0.652, max=0.94, sum=3.26 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.08437865683887716,
          "description": "min=0, mean=0.084, max=0.337, sum=0.591 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.907,
          "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.4896911106103133,
          "description": "min=0.029, mean=0.49, max=0.958, sum=2.448 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.68389662027833,
          "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.21816936052387548,
          "description": "min=0.169, mean=0.218, max=0.25, sum=1.091 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Opus (20240229)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7127372627372628,
          "markdown": false
        },
        {
          "value": 0.3514456352457142,
          "description": "min=0.351, mean=0.351, max=0.351, sum=0.351 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.2644558438495928,
          "description": "min=0.264, mean=0.264, max=0.264, sum=0.264 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.44052528844509653,
          "description": "min=0.441, mean=0.441, max=0.441, sum=0.441 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.956,
          "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.7678947368421053,
          "description": "min=0.6, mean=0.768, max=0.96, sum=3.839 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.7603378601542615,
          "description": "min=0.526, mean=0.76, max=0.889, sum=5.322 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.924,
          "description": "min=0.924, mean=0.924, max=0.924, sum=0.924 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.6620640054788587,
          "description": "min=0.153, mean=0.662, max=0.989, sum=3.31 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.7753479125248509,
          "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.2398990631069128,
          "description": "min=0.188, mean=0.24, max=0.285, sum=1.199 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.5 Sonnet (20240620)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.9121711621711621,
          "markdown": false
        },
        {
          "value": 0.7462731245648014,
          "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.7493178538118532,
          "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.5015987604513761,
          "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.972,
          "description": "min=0.972, mean=0.972, max=0.972, sum=0.972 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.7994035087719298,
          "description": "min=0.59, mean=0.799, max=0.96, sum=3.997 (5)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.8128304512025443,
          "description": "min=0.579, mean=0.813, max=0.953, sum=5.69 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.949,
          "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.7066311604234408,
          "description": "min=0.455, mean=0.707, max=0.968, sum=3.533 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.8250497017892644,
          "description": "min=0.825, mean=0.825, max=0.825, sum=0.825 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.22898372314831902,
          "description": "min=0.181, mean=0.229, max=0.27, sum=1.145 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        }
      ],
      [
        {
          "value": "Command",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3579087579087579,
          "markdown": false
        },
        {
          "value": 0.748757488257181,
          "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 0.7768791442376555,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 0.39111384104746405,
          "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 0.774,
          "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 0.525157894736842,
          "description": "min=0.27, mean=0.525, max=0.88, sum=2.626 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 0.2359498178959623,
          "description": "min=0.1, mean=0.236, max=0.349, sum=1.652 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 0.452,
          "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 0.5775312900780561,
          "description": "min=0.365, mean=0.578, max=0.884, sum=2.888 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 0.44532803180914515,
          "description": "min=0.445, mean=0.445, max=0.445, sum=0.445 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 0.0881786096035731,
          "description": "min=0.013, mean=0.088, max=0.151, sum=0.441 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        }
      ],
      [
        {
          "value": "Command Light",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.11803196803196803,
          "markdown": false
        },
        {
          "value": 0.6294598740957924,
          "description": "min=0.629, mean=0.629, max=0.629, sum=0.629 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 0.6858245756174158,
          "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 0.19510428131457905,
          "description": "min=0.195, mean=0.195, max=0.195, sum=0.195 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 0.398,
          "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 0.3856491228070175,
          "description": "min=0.25, mean=0.386, max=0.57, sum=1.928 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 0.09813347720324464,
          "description": "min=0.026, mean=0.098, max=0.167, sum=0.687 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 0.149,
          "description": "min=0.149, mean=0.149, max=0.149, sum=0.149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 0.3966927589507049,
          "description": "min=0.173, mean=0.397, max=0.874, sum=1.983 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 0.3121272365805169,
          "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 0.022554531557801464,
          "description": "min=0.0, mean=0.023, max=0.064, sum=0.113 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        }
      ],
      [
        {
          "value": "Command R",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3336496836496836,
          "markdown": false
        },
        {
          "value": 0.7417055733551178,
          "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.7204548439351282,
          "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 0.35223443804297827,
          "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 0.782,
          "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.5672280701754386,
          "description": "min=0.33, mean=0.567, max=0.82, sum=2.836 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.2658869893876014,
          "description": "min=0.158, mean=0.266, max=0.333, sum=1.861 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 0.551,
          "description": "min=0.551, mean=0.551, max=0.551, sum=0.551 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 0.506716010735285,
          "description": "min=0.211, mean=0.507, max=0.905, sum=2.534 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 0.5546719681908548,
          "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.14926381105836087,
          "description": "min=0.107, mean=0.149, max=0.175, sum=0.746 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        }
      ],
      [
        {
          "value": "Command R Plus",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.48916083916083913,
          "markdown": false
        },
        {
          "value": 0.7352347742859234,
          "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.7113734272191162,
          "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.3431863421631031,
          "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.828,
          "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.590280701754386,
          "description": "min=0.21, mean=0.59, max=0.89, sum=2.951 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.4031239647518717,
          "description": "min=0.25, mean=0.403, max=0.607, sum=2.822 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.738,
          "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 0.6715721169408535,
          "description": "min=0.428, mean=0.672, max=0.947, sum=3.358 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.5666003976143141,
          "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.20330294109057978,
          "description": "min=0.156, mean=0.203, max=0.233, sum=1.017 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.0 Pro (002)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.46631701631701633,
          "markdown": false
        },
        {
          "value": 0.7506089527885245,
          "description": "min=0.751, mean=0.751, max=0.751, sum=0.751 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.7141677669834366,
          "description": "min=0.714, mean=0.714, max=0.714, sum=0.714 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.3905657523185001,
          "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.788,
          "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.5344912280701755,
          "description": "min=0.27, mean=0.534, max=0.81, sum=2.672 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.664858155127433,
          "description": "min=0.553, mean=0.665, max=0.859, sum=4.654 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.816,
          "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 0.47512742034143357,
          "description": "min=0.118, mean=0.475, max=0.811, sum=2.376 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.4831013916500994,
          "description": "min=0.483, mean=0.483, max=0.483, sum=0.483 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.19443454427557416,
          "description": "min=0.144, mean=0.194, max=0.231, sum=0.972 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7818015318015318,
          "markdown": false
        },
        {
          "value": 0.7825409539853793,
          "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7481250574384422,
          "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.37802870840922326,
          "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.902,
          "description": "min=0.902, mean=0.902, max=0.902, sum=0.902 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7716140350877194,
          "description": "min=0.62, mean=0.772, max=0.93, sum=3.858 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.8247352131073062,
          "description": "min=0.692, mean=0.825, max=0.956, sum=5.773 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.836,
          "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 0.7572377274443406,
          "description": "min=0.46, mean=0.757, max=1, sum=3.786 (5)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.6918489065606361,
          "description": "min=0.692, mean=0.692, max=0.692, sum=0.692 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.18929981926383332,
          "description": "min=0.118, mean=0.189, max=0.252, sum=0.946 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Flash (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7161838161838162,
          "markdown": false
        },
        {
          "value": 0.7828677496382116,
          "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.7232397428872679,
          "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.33241428445406646,
          "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.928,
          "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.7028070175438597,
          "description": "min=0.58, mean=0.703, max=0.93, sum=3.514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.752668194039063,
          "description": "min=0.632, mean=0.753, max=0.889, sum=5.269 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.785,
          "description": "min=0.785, mean=0.785, max=0.785, sum=0.785 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 0.6610464918621973,
          "description": "min=0.425, mean=0.661, max=0.968, sum=3.305 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.679920477137177,
          "description": "min=0.68, mean=0.68, max=0.68, sum=0.68 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.2251030248644216,
          "description": "min=0.186, mean=0.225, max=0.253, sum=1.126 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        }
      ],
      [
        {
          "value": "PaLM-2 (Bison)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5677405927405927,
          "markdown": false
        },
        {
          "value": 0.7180180202611235,
          "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.812636595594866,
          "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.38995180850624384,
          "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.878,
          "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.6075087719298246,
          "description": "min=0.39, mean=0.608, max=0.87, sum=3.038 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.4209237315112468,
          "description": "min=0.25, mean=0.421, max=0.558, sum=2.946 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.61,
          "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.6447033707273244,
          "description": "min=0.466, mean=0.645, max=0.937, sum=3.224 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.5467196819085487,
          "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.2408179397122467,
          "description": "min=0.22, mean=0.241, max=0.255, sum=1.204 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        }
      ],
      [
        {
          "value": "PaLM-2 (Unicorn)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6811355311355312,
          "markdown": false
        },
        {
          "value": 0.5828773710219437,
          "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.6742427211512454,
          "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.4348175818600496,
          "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.938,
          "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.7018245614035088,
          "description": "min=0.53, mean=0.702, max=0.96, sum=3.509 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.6736886940069314,
          "description": "min=0.526, mean=0.674, max=0.867, sum=4.716 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.831,
          "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.6773014537121316,
          "description": "min=0.452, mean=0.677, max=0.926, sum=3.387 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.68389662027833,
          "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.25952565257738897,
          "description": "min=0.236, mean=0.26, max=0.279, sum=1.298 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        }
      ],
      [
        {
          "value": "Yi Large (Preview)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.512087912087912,
          "markdown": false
        },
        {
          "value": 0.37282202834298434,
          "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.5864410195534986,
          "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.42769608198028625,
          "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.946,
          "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.7116140350877193,
          "description": "min=0.52, mean=0.712, max=0.86, sum=3.558 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.711688757771989,
          "description": "min=0.553, mean=0.712, max=0.874, sum=4.982 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.69,
          "description": "min=0.69, mean=0.69, max=0.69, sum=0.69 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 0.5188015622942135,
          "description": "min=0.145, mean=0.519, max=0.884, sum=2.594 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.6600397614314115,
          "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.17606426591097207,
          "description": "min=0.126, mean=0.176, max=0.218, sum=0.88 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        }
      ],
      [
        {
          "value": "Mistral Small (2402)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3231018981018981,
          "markdown": false
        },
        {
          "value": 0.51918323792808,
          "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.5868872145834194,
          "description": "min=0.587, mean=0.587, max=0.587, sum=0.587 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.30423703035930183,
          "description": "min=0.304, mean=0.304, max=0.304, sum=0.304 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.862,
          "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.5928070175438597,
          "description": "min=0.26, mean=0.593, max=0.89, sum=2.964 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.6205257229737035,
          "description": "min=0.367, mean=0.621, max=0.859, sum=4.344 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.734,
          "description": "min=0.734, mean=0.734, max=0.734, sum=0.734 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.3894440556431952,
          "description": "min=0, mean=0.389, max=0.789, sum=1.947 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.6163021868787276,
          "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.16864846157680483,
          "description": "min=0.076, mean=0.169, max=0.215, sum=0.843 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        }
      ],
      [
        {
          "value": "Mistral Medium (2312)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3031968031968032,
          "markdown": false
        },
        {
          "value": 0.449419017401787,
          "description": "min=0.449, mean=0.449, max=0.449, sum=0.449 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.46849270602031157,
          "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.28968511687682613,
          "description": "min=0.29, mean=0.29, max=0.29, sum=0.29 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.83,
          "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.6177894736842106,
          "description": "min=0.32, mean=0.618, max=0.91, sum=3.089 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.5654665136305284,
          "description": "min=0.4, mean=0.565, max=0.756, sum=3.958 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.706,
          "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.45166199773470267,
          "description": "min=0.066, mean=0.452, max=0.692, sum=2.258 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.610337972166998,
          "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.16874314003204713,
          "description": "min=0.07, mean=0.169, max=0.22, sum=0.844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        }
      ],
      [
        {
          "value": "Mistral Large (2402)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.36854811854811853,
          "markdown": false
        },
        {
          "value": 0.45352242088002337,
          "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.4854292010239775,
          "description": "min=0.485, mean=0.485, max=0.485, sum=0.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.31110601485959,
          "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.894,
          "description": "min=0.894, mean=0.894, max=0.894, sum=0.894 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.6380701754385966,
          "description": "min=0.38, mean=0.638, max=0.92, sum=3.19 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.7503596249618282,
          "description": "min=0.632, mean=0.75, max=0.904, sum=5.253 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.694,
          "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.47889553993976774,
          "description": "min=0.1, mean=0.479, max=0.821, sum=2.394 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.4990059642147117,
          "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.18189117805584806,
          "description": "min=0.098, mean=0.182, max=0.224, sum=0.909 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        }
      ],
      [
        {
          "value": "Mistral Large 2 (2407)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7877039627039627,
          "markdown": false
        },
        {
          "value": 0.7787181644901774,
          "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.7343356126413015,
          "description": "min=0.734, mean=0.734, max=0.734, sum=0.734 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.4526913126167187,
          "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.932,
          "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.7245964912280702,
          "description": "min=0.52, mean=0.725, max=0.9, sum=3.623 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.6767085174673915,
          "description": "min=0.342, mean=0.677, max=0.881, sum=4.737 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.912,
          "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 0.6459082531162472,
          "description": "min=0.229, mean=0.646, max=1, sum=3.23 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.7753479125248509,
          "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.19245828846755558,
          "description": "min=0.14, mean=0.192, max=0.231, sum=0.962 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        }
      ],
      [
        {
          "value": "Mistral NeMo (2407)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3776889776889777,
          "markdown": false
        },
        {
          "value": 0.7309946891410284,
          "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.6498504213061129,
          "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.26478995157604784,
          "description": "min=0.265, mean=0.265, max=0.265, sum=0.265 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.822,
          "description": "min=0.822, mean=0.822, max=0.822, sum=0.822 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.6042807017543861,
          "description": "min=0.29, mean=0.604, max=0.89, sum=3.021 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.6684556003405452,
          "description": "min=0.558, mean=0.668, max=0.852, sum=4.679 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.782,
          "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 0.4152472809115042,
          "description": "min=0.232, mean=0.415, max=0.758, sum=2.076 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.5904572564612326,
          "description": "min=0.59, mean=0.59, max=0.59, sum=0.59 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.17737132532046168,
          "description": "min=0.111, mean=0.177, max=0.211, sum=0.887 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 (text-davinci-003)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4850649350649351,
          "markdown": false
        },
        {
          "value": 0.7308179266219924,
          "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.7697538952991528,
          "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.4134401941760032,
          "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.828,
          "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.5547368421052632,
          "description": "min=0.3, mean=0.555, max=0.83, sum=2.774 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.44940748312841333,
          "description": "min=0.3, mean=0.449, max=0.548, sum=3.146 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.615,
          "description": "min=0.615, mean=0.615, max=0.615, sum=0.615 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.6219796743708239,
          "description": "min=0.324, mean=0.622, max=0.947, sum=3.11 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.5308151093439364,
          "description": "min=0.531, mean=0.531, max=0.531, sum=0.531 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.19114446816962385,
          "description": "min=0.094, mean=0.191, max=0.227, sum=0.956 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 (text-davinci-002)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.37459207459207455,
          "markdown": false
        },
        {
          "value": 0.7191082981361542,
          "description": "min=0.719, mean=0.719, max=0.719, sum=0.719 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.709725528129197,
          "description": "min=0.71, mean=0.71, max=0.71, sum=0.71 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.39413936756218626,
          "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.796,
          "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.5682456140350877,
          "description": "min=0.26, mean=0.568, max=0.84, sum=2.841 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.42812664238612835,
          "description": "min=0.288, mean=0.428, max=0.548, sum=2.997 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.479,
          "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.5802111877591993,
          "description": "min=0.326, mean=0.58, max=0.916, sum=2.901 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.5248508946322068,
          "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.17444555838609413,
          "description": "min=0.077, mean=0.174, max=0.212, sum=0.872 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 Turbo (0613)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4002830502830503,
          "markdown": false
        },
        {
          "value": 0.6545452493962763,
          "description": "min=0.655, mean=0.655, max=0.655, sum=0.655 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.6775232576261939,
          "description": "min=0.678, mean=0.678, max=0.678, sum=0.678 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.334590535421788,
          "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.838,
          "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.614,
          "description": "min=0.38, mean=0.614, max=0.88, sum=3.07 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.6667095586312232,
          "description": "min=0.533, mean=0.667, max=0.826, sum=4.667 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.501,
          "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.5283293461368486,
          "description": "min=0.302, mean=0.528, max=0.747, sum=2.642 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.6222664015904572,
          "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.1874826961367359,
          "description": "min=0.1, mean=0.187, max=0.23, sum=0.937 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 Turbo (1106 preview)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7453712953712954,
          "markdown": false
        },
        {
          "value": 0.7270706981870464,
          "description": "min=0.727, mean=0.727, max=0.727, sum=0.727 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.7629015365855754,
          "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.43470596512187554,
          "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.95,
          "description": "min=0.95, mean=0.95, max=0.95, sum=0.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.6990877192982456,
          "description": "min=0.47, mean=0.699, max=0.96, sum=3.495 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.856862689420829,
          "description": "min=0.711, mean=0.857, max=0.97, sum=5.998 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.668,
          "description": "min=0.668, mean=0.668, max=0.668, sum=0.668 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.6259516326823287,
          "description": "min=0.368, mean=0.626, max=0.989, sum=3.13 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.8170974155069582,
          "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.20467244349620536,
          "description": "min=0.156, mean=0.205, max=0.241, sum=1.023 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 (0613)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.9089077589077589,
          "markdown": false
        },
        {
          "value": 0.7677791724251858,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.7896684683368644,
          "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.4568767355321626,
          "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.96,
          "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.734842105263158,
          "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.8024156744474984,
          "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.932,
          "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.7127682635939792,
          "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.8151093439363817,
          "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.21051123816678335,
          "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 Turbo (2024-04-09)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8984015984015984,
          "markdown": false
        },
        {
          "value": 0.7611681213732712,
          "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.7952101327052921,
          "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.48210297207204034,
          "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.97,
          "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.7110877192982457,
          "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.8328762923010169,
          "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.824,
          "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 0.7273177369269808,
          "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.7833001988071571,
          "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.21753567757130296,
          "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.9594572094572095,
          "style": {
            "font-weight": "bold"
          },
          "markdown": false
        },
        {
          "value": 0.8039471342235485,
          "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.803159876040668,
          "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5013491384419004,
          "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.966,
          "description": "min=0.966, mean=0.966, max=0.966, sum=0.966 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.7483508771929823,
          "description": "min=0.61, mean=0.748, max=0.95, sum=3.742 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8288575493593852,
          "description": "min=0.632, mean=0.829, max=0.977, sum=5.802 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.905,
          "description": "min=0.905, mean=0.905, max=0.905, sum=0.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 0.7331665151590536,
          "description": "min=0.441, mean=0.733, max=0.989, sum=3.666 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.856858846918489,
          "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.23070145580018325,
          "description": "min=0.176, mean=0.231, max=0.281, sum=1.154 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7568181818181818,
          "markdown": false
        },
        {
          "value": 0.7679012725768495,
          "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7456842704979154,
          "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.38554568146272244,
          "description": "min=0.386, mean=0.386, max=0.386, sum=0.386 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.92,
          "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.6678245614035088,
          "description": "min=0.42, mean=0.668, max=0.91, sum=3.339 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.8015479714990118,
          "description": "min=0.605, mean=0.802, max=0.97, sum=5.611 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.843,
          "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 0.6526226166818369,
          "description": "min=0.414, mean=0.653, max=0.937, sum=3.263 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.7475149105367793,
          "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.20633712381155664,
          "description": "min=0.153, mean=0.206, max=0.254, sum=1.032 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Palmyra X V2 (33B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6397852147852148,
          "markdown": false
        },
        {
          "value": 0.7524621916104515,
          "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.7515145595314991,
          "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.42810099574571514,
          "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.878,
          "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.6212631578947369,
          "description": "min=0.37, mean=0.621, max=0.91, sum=3.106 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.5798099900915078,
          "description": "min=0.395, mean=0.58, max=0.8, sum=4.059 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.735,
          "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.6442732446140653,
          "description": "min=0.33, mean=0.644, max=0.989, sum=3.221 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.5984095427435387,
          "description": "min=0.598, mean=0.598, max=0.598, sum=0.598 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.23881568531367575,
          "description": "min=0.2, mean=0.239, max=0.27, sum=1.194 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        }
      ],
      [
        {
          "value": "Palmyra X V3 (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7234432234432234,
          "markdown": false
        },
        {
          "value": 0.7059408242658334,
          "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.6851822065584833,
          "description": "min=0.685, mean=0.685, max=0.685, sum=0.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.40719623137956623,
          "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.938,
          "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.7018245614035088,
          "description": "min=0.53, mean=0.702, max=0.96, sum=3.509 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.72287040897812,
          "description": "min=0.579, mean=0.723, max=0.896, sum=5.06 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.831,
          "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.7087087740761012,
          "description": "min=0.439, mean=0.709, max=0.926, sum=3.544 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.68389662027833,
          "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.2618576157125692,
          "description": "min=0.235, mean=0.262, max=0.284, sum=1.309 (5)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        }
      ],
      [
        {
          "value": "Palmyra-X-004",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8504828504828505,
          "markdown": false
        },
        {
          "value": 0.7726459909303317,
          "description": "min=0.773, mean=0.773, max=0.773, sum=0.773 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.7537866041914042,
          "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.45663625626438326,
          "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.926,
          "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.7388421052631579,
          "description": "min=0.52, mean=0.739, max=0.92, sum=3.694 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.7672380613629084,
          "description": "min=0.553, mean=0.767, max=0.948, sum=5.371 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.905,
          "description": "min=0.905, mean=0.905, max=0.905, sum=0.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.7295161705353302,
          "description": "min=0.433, mean=0.73, max=0.989, sum=3.648 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.7753479125248509,
          "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.2032628748676411,
          "description": "min=0.144, mean=0.203, max=0.249, sum=1.016 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v1.9.0/groups/latex/core_scenarios_accuracy.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v1.9.0/groups/json/core_scenarios_accuracy.json"
      }
    ],
    "name": "accuracy"
  },
  {
    "title": "Efficiency",
    "header": [
      {
        "value": "Model/adapter",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "NarrativeQA - Observed inference time (s)",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - Observed inference time (s)",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - Observed inference time (s)",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "OpenbookQA - Observed inference time (s)",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "MMLU - Observed inference time (s)",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MATH - Observed inference time (s)",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MATH"
        }
      },
      {
        "value": "GSM8K - Observed inference time (s)",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "LegalBench - Observed inference time (s)",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al., 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "MedQA - Observed inference time (s)",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "MedQA"
        }
      },
      {
        "value": "WMT 2014 - Observed inference time (s)",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\nObserved inference runtime (s): Average observed time to process a request to the model (via an API, and thus depends on particular deployment).",
        "markdown": false,
        "lower_is_better": true,
        "metadata": {
          "metric": "Observed inference time (s)",
          "run_group": "WMT 2014"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Phi-3 (14B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.12307692307692307,
          "markdown": false
        },
        {
          "value": 29.5092350200868,
          "description": "min=29.509, mean=29.509, max=29.509, sum=29.509 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 44.23756227874756,
          "description": "min=44.238, mean=44.238, max=44.238, sum=44.238 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 49.743374599456786,
          "description": "min=49.743, mean=49.743, max=49.743, sum=49.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.3850016188621521,
          "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.5039482383811682,
          "description": "min=0.381, mean=0.504, max=0.722, sum=2.52 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 71.56076915436368,
          "description": "min=67.969, mean=71.561, max=74.993, sum=500.925 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 74.93269198083877,
          "description": "min=74.933, mean=74.933, max=74.933, sum=74.933 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 7.879368148866983,
          "description": "min=5.972, mean=7.879, max=14.755, sum=39.397 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1.7916561092581473,
          "description": "min=1.792, mean=1.792, max=1.792, sum=1.792 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 19.98681167411759,
          "description": "min=19.742, mean=19.987, max=20.079, sum=99.934 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        }
      ],
      [
        {
          "value": "Phi-3 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.18803418803418803,
          "markdown": false
        },
        {
          "value": 30.40753108749927,
          "description": "min=30.408, mean=30.408, max=30.408, sum=30.408 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 68.2322377743721,
          "description": "min=68.232, mean=68.232, max=68.232, sum=68.232 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 63.00250503087044,
          "description": "min=63.003, mean=63.003, max=63.003, sum=63.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.28856802701950074,
          "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.406433069689232,
          "description": "min=0.275, mean=0.406, max=0.549, sum=2.032 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 60.680695580739844,
          "description": "min=49.379, mean=60.681, max=73.413, sum=424.765 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "value": 8.34200078530511,
          "description": "min=6.293, mean=8.342, max=16.012, sum=41.71 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.8902683931126983,
          "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 20.399208641134514,
          "description": "min=20.252, mean=20.399, max=20.714, sum=101.996 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        }
      ],
      [
        {
          "value": "DBRX Instruct",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5470362970362971,
          "markdown": false
        },
        {
          "value": 1.6445875322315056,
          "description": "min=1.645, mean=1.645, max=1.645, sum=1.645 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.1746999933719635,
          "description": "min=1.175, mean=1.175, max=1.175, sum=1.175 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.6648788969516755,
          "description": "min=0.665, mean=0.665, max=0.665, sum=0.665 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.3277706532478333,
          "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.41247134314921857,
          "description": "min=0.39, mean=0.412, max=0.432, sum=2.062 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 2.305378989452493,
          "description": "min=0.531, mean=2.305, max=3.852, sum=16.138 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 2.3839432048797606,
          "description": "min=2.384, mean=2.384, max=2.384, sum=2.384 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.73349196183029,
          "description": "min=0.366, mean=0.733, max=1.771, sum=3.667 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.4383622557221066,
          "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0594140760888837,
          "description": "min=0.849, mean=1.059, max=1.342, sum=5.297 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek LLM Chat (67B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.31565101565101567,
          "markdown": false
        },
        {
          "value": 3.359551859573579,
          "description": "min=3.36, mean=3.36, max=3.36, sum=3.36 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 2.2367931361198425,
          "description": "min=2.237, mean=2.237, max=2.237, sum=2.237 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.8567402980327606,
          "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.41702947664260864,
          "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.508463426874395,
          "description": "min=0.48, mean=0.508, max=0.551, sum=2.542 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 4.442596748084942,
          "description": "min=3.389, mean=4.443, max=6.234, sum=31.098 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 5.876643376111984,
          "description": "min=5.877, mean=5.877, max=5.877, sum=5.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.9420770218153176,
          "description": "min=0.524, mean=0.942, max=2.301, sum=4.71 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.8296676231899982,
          "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.429440071817079,
          "description": "min=1.381, mean=1.429, max=1.464, sum=7.147 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        }
      ],
      [
        {
          "value": "Falcon (40B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.0923909423909424,
          "markdown": false
        },
        {
          "value": 4.985411514362819,
          "description": "min=4.985, mean=4.985, max=4.985, sum=4.985 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 3.184468511581421,
          "description": "min=3.184, mean=3.184, max=3.184, sum=3.184 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 2.848947753429413,
          "description": "min=2.849, mean=2.849, max=2.849, sum=2.849 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.268236391544342,
          "description": "min=1.268, mean=1.268, max=1.268, sum=1.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.4308063889804639,
          "description": "min=1.176, mean=1.431, max=1.805, sum=7.154 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 11.413689562224084,
          "description": "min=7.555, mean=11.414, max=18.723, sum=79.896 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 12.967224577903748,
          "description": "min=12.967, mean=12.967, max=12.967, sum=12.967 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.730808089747147,
          "description": "min=1.333, mean=1.731, max=3.174, sum=8.654 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 2.202825612149703,
          "description": "min=2.203, mean=2.203, max=2.203, sum=2.203 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 3.0981059579736714,
          "description": "min=2.468, mean=3.098, max=4.642, sum=15.491 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        }
      ],
      [
        {
          "value": "Falcon (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3886613386613387,
          "markdown": false
        },
        {
          "value": 1.1411562691272144,
          "description": "min=1.141, mean=1.141, max=1.141, sum=1.141 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0090243232250213,
          "description": "min=1.009, mean=1.009, max=1.009, sum=1.009 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.8758702797889709,
          "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.4118037748336792,
          "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.47453500427279555,
          "description": "min=0.434, mean=0.475, max=0.497, sum=2.373 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 6.987098801445013,
          "description": "min=5.445, mean=6.987, max=10.873, sum=48.91 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 6.940216990470886,
          "description": "min=6.94, mean=6.94, max=6.94, sum=6.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.6278266410596228,
          "description": "min=0.453, mean=0.628, max=1.041, sum=3.139 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.7352914724861889,
          "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.6038075838932468,
          "description": "min=1.05, mean=1.604, max=3.055, sum=8.019 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        }
      ],
      [
        {
          "value": "Gemma 2 Instruct (27B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7662837162837163,
          "markdown": false
        },
        {
          "value": 0.6603116545878666,
          "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.4863240420818329,
          "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.35805381870269776,
          "description": "min=0.358, mean=0.358, max=0.358, sum=0.358 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.3270734968185425,
          "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.3286796834259702,
          "description": "min=0.317, mean=0.329, max=0.337, sum=1.643 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.9034432935092742,
          "description": "min=1.515, mean=1.903, max=2.648, sum=13.324 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 2.3315503742694856,
          "description": "min=2.332, mean=2.332, max=2.332, sum=2.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.4403507251683155,
          "description": "min=0.328, mean=0.44, max=0.796, sum=2.202 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.4512898187277094,
          "description": "min=0.451, mean=0.451, max=0.451, sum=0.451 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.6983992647690125,
          "description": "min=0.666, mean=0.698, max=0.715, sum=3.492 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        }
      ],
      [
        {
          "value": "Gemma 2 Instruct (9B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.86005661005661,
          "markdown": false
        },
        {
          "value": 0.5928616705075116,
          "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.44568803215026853,
          "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.337234415769577,
          "description": "min=0.337, mean=0.337, max=0.337, sum=0.337 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.3059106550216675,
          "description": "min=0.306, mean=0.306, max=0.306, sum=0.306 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.3187573717686168,
          "description": "min=0.299, mean=0.319, max=0.334, sum=1.594 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.3440718759718908,
          "description": "min=1.006, mean=1.344, max=1.765, sum=9.409 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.720498773097992,
          "description": "min=1.72, mean=1.72, max=1.72, sum=1.72 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 0.3840073023663075,
          "description": "min=0.31, mean=0.384, max=0.652, sum=1.92 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.3161872125288127,
          "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.6330890842213928,
          "description": "min=0.526, mean=0.633, max=0.82, sum=3.165 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        }
      ],
      [
        {
          "value": "Gemma (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8150349650349651,
          "markdown": false
        },
        {
          "value": 0.9086058952438999,
          "description": "min=0.909, mean=0.909, max=0.909, sum=0.909 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.5911745510101318,
          "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.3430815353393555,
          "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.28152281618118286,
          "description": "min=0.282, mean=0.282, max=0.282, sum=0.282 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.27346607242550763,
          "description": "min=0.251, mean=0.273, max=0.293, sum=1.367 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.1609408722047545,
          "description": "min=0.995, mean=1.161, max=1.453, sum=8.127 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 2.024561887741089,
          "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.5303036133605687,
          "description": "min=0.295, mean=0.53, max=1.42, sum=2.652 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.3144090270427302,
          "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.5235538594776801,
          "description": "min=0.503, mean=0.524, max=0.541, sum=2.618 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (13B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7523476523476523,
          "markdown": false
        },
        {
          "value": 0.7950913200915699,
          "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.5793666501045227,
          "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.3839698841571808,
          "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.34700755834579466,
          "description": "min=0.347, mean=0.347, max=0.347, sum=0.347 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.37437369656144526,
          "description": "min=0.359, mean=0.374, max=0.383, sum=1.872 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.5161172209789922,
          "description": "min=1.083, mean=1.516, max=1.771, sum=10.613 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.7367573575973512,
          "description": "min=1.737, mean=1.737, max=1.737, sum=1.737 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.43780977145306127,
          "description": "min=0.331, mean=0.438, max=0.729, sum=2.189 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.4588449499005115,
          "description": "min=0.459, mean=0.459, max=0.459, sum=0.459 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.6911807014709866,
          "description": "min=0.557, mean=0.691, max=0.814, sum=3.456 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4095904095904096,
          "markdown": false
        },
        {
          "value": 1.8709671289148464,
          "description": "min=1.871, mean=1.871, max=1.871, sum=1.871 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.277897496700287,
          "description": "min=1.278, mean=1.278, max=1.278, sum=1.278 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.8177921280860901,
          "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.6557973260879517,
          "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.5013968416013215,
          "description": "min=0.465, mean=0.501, max=0.56, sum=2.507 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 2.4432508421434598,
          "description": "min=1.813, mean=2.443, max=3.147, sum=17.103 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 3.737159442663193,
          "description": "min=3.737, mean=3.737, max=3.737, sum=3.737 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.7591354159811778,
          "description": "min=0.448, mean=0.759, max=1.744, sum=3.796 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.9713700282170806,
          "description": "min=0.971, mean=0.971, max=0.971, sum=0.971 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0736038563633745,
          "description": "min=0.809, mean=1.074, max=1.477, sum=5.368 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6945554445554446,
          "markdown": false
        },
        {
          "value": 0.8524049973823655,
          "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.584290323972702,
          "description": "min=0.584, mean=0.584, max=0.584, sum=0.584 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.47909903168678286,
          "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.3927152595520019,
          "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.33028721380233766,
          "description": "min=0.314, mean=0.33, max=0.349, sum=1.651 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 2.6600816047289086,
          "description": "min=1.362, mean=2.66, max=5.271, sum=18.621 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.95984334897995,
          "description": "min=1.96, mean=1.96, max=1.96, sum=1.96 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.4277655324222306,
          "description": "min=0.306, mean=0.428, max=0.76, sum=2.139 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.46650436763497993,
          "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.697166075241057,
          "description": "min=0.582, mean=0.697, max=0.802, sum=3.486 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3 (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4107725607725608,
          "markdown": false
        },
        {
          "value": 1.7946508300136512,
          "description": "min=1.795, mean=1.795, max=1.795, sum=1.795 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.211742308139801,
          "description": "min=1.212, mean=1.212, max=1.212, sum=1.212 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.5584413967132569,
          "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.35184384298324584,
          "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.40422279727668087,
          "description": "min=0.387, mean=0.404, max=0.432, sum=2.021 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 15.818764438908431,
          "description": "min=14.895, mean=15.819, max=17.569, sum=110.731 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 4.199564570903778,
          "description": "min=4.2, mean=4.2, max=4.2, sum=4.2 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.8703131128024035,
          "description": "min=0.416, mean=0.87, max=2.556, sum=4.352 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.547684069419239,
          "description": "min=0.548, mean=0.548, max=0.548, sum=0.548 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.239086973613365,
          "description": "min=1.198, mean=1.239, max=1.282, sum=6.195 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3 (8B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7458874458874459,
          "markdown": false
        },
        {
          "value": 0.7260531909029249,
          "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.523505747795105,
          "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.42760186743736267,
          "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.3076804256439209,
          "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.3165063006919727,
          "description": "min=0.3, mean=0.317, max=0.344, sum=1.583 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.651119198181415,
          "description": "min=5.431, mean=5.651, max=6.121, sum=39.558 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.770608879327774,
          "description": "min=1.771, mean=1.771, max=1.771, sum=1.771 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.4651390315970952,
          "description": "min=0.322, mean=0.465, max=0.989, sum=2.326 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.36141945306159867,
          "description": "min=0.361, mean=0.361, max=0.361, sum=0.361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.5631435248437351,
          "description": "min=0.547, mean=0.563, max=0.573, sum=2.816 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (405B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.32387612387612386,
          "markdown": false
        },
        {
          "value": 2.964381891572979,
          "description": "min=2.964, mean=2.964, max=2.964, sum=2.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 4.104731038570404,
          "description": "min=4.105, mean=4.105, max=4.105, sum=4.105 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.9464026074409485,
          "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 2.6930377073287963,
          "description": "min=2.693, mean=2.693, max=2.693, sum=2.693 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.528599283887629,
          "description": "min=0.464, mean=0.529, max=0.598, sum=2.643 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 4.117939187053165,
          "description": "min=3.188, mean=4.118, max=4.906, sum=28.826 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 2.737115991592407,
          "description": "min=2.737, mean=2.737, max=2.737, sum=2.737 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.7974352428433198,
          "description": "min=0.492, mean=0.797, max=1.89, sum=3.987 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.9505775325577965,
          "description": "min=0.951, mean=0.951, max=0.951, sum=0.951 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1.0554436480227387,
          "description": "min=0.96, mean=1.055, max=1.147, sum=5.277 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.1427072927072927,
          "markdown": false
        },
        {
          "value": 3.4022000312805174,
          "description": "min=3.402, mean=3.402, max=3.402, sum=3.402 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 3.354476467370987,
          "description": "min=3.354, mean=3.354, max=3.354, sum=3.354 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 3.534221899032593,
          "description": "min=3.534, mean=3.534, max=3.534, sum=3.534 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 3.8750249314308167,
          "description": "min=3.875, mean=3.875, max=3.875, sum=3.875 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 12.026294649132511,
          "description": "min=2.836, mean=12.026, max=45.251, sum=60.131 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 6.527233472429779,
          "description": "min=5.784, mean=6.527, max=7.228, sum=45.691 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 4.9902911036014554,
          "description": "min=4.99, mean=4.99, max=4.99, sum=4.99 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 3.1709040240543165,
          "description": "min=2.233, mean=3.171, max=3.636, sum=15.855 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 3.0525233205222704,
          "description": "min=3.053, mean=3.053, max=3.053, sum=3.053 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.9648550899177766,
          "description": "min=0.762, mean=0.965, max=1.177, sum=4.824 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (8B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6163003663003663,
          "markdown": false
        },
        {
          "value": 0.5813529316808136,
          "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.5441543731689453,
          "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.751717613697052,
          "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 2.9374450149536133,
          "description": "min=2.937, mean=2.937, max=2.937, sum=2.937 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.41729471965421716,
          "description": "min=0.284, mean=0.417, max=0.567, sum=2.086 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1.9274194573191807,
          "description": "min=1.617, mean=1.927, max=2.175, sum=13.492 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 2.108796592712402,
          "description": "min=2.109, mean=2.109, max=2.109, sum=2.109 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.4814103188942614,
          "description": "min=0.409, mean=0.481, max=0.626, sum=2.407 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.742541556803891,
          "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.5651802479746801,
          "description": "min=0.439, mean=0.565, max=0.727, sum=2.826 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.2 Vision Instruct Turbo (11B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8999333999333999,
          "markdown": false
        },
        {
          "value": 0.37828690300525075,
          "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.28472757744789123,
          "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.32630494999885556,
          "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.21042356300354004,
          "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.40622414255142214,
          "description": "min=0.226, mean=0.406, max=0.726, sum=2.031 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 2.099496145662431,
          "description": "min=1.715, mean=2.099, max=2.413, sum=14.696 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1.2738200931549073,
          "description": "min=1.274, mean=1.274, max=1.274, sum=1.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.2767821625533402,
          "description": "min=0.199, mean=0.277, max=0.438, sum=1.384 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.20540714263916016,
          "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.38295877939459017,
          "description": "min=0.349, mean=0.383, max=0.412, sum=1.915 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.2 Vision Instruct Turbo (90B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6097569097569098,
          "markdown": false
        },
        {
          "value": 0.8297326531208736,
          "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1.110703297138214,
          "description": "min=1.111, mean=1.111, max=1.111, sum=1.111 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.4218848171234131,
          "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.28476666021347047,
          "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.7984467656654225,
          "description": "min=0.266, mean=0.798, max=2.612, sum=3.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.739186799526185,
          "description": "min=4.64, mean=5.739, max=6.652, sum=40.174 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 2.8894128675460817,
          "description": "min=2.889, mean=2.889, max=2.889, sum=2.889 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.47773526830658064,
          "description": "min=0.284, mean=0.478, max=1.152, sum=2.389 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.3180293652930743,
          "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.8156762526912515,
          "description": "min=0.737, mean=0.816, max=0.848, sum=4.078 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "LLaMA (65B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.07958707958707958,
          "markdown": false
        },
        {
          "value": 2.9087761751362975,
          "description": "min=2.909, mean=2.909, max=2.909, sum=2.909 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1.3611893365383148,
          "description": "min=1.361, mean=1.361, max=1.361, sum=1.361 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 4.703710767745972,
          "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 4.490233006477356,
          "description": "min=4.49, mean=4.49, max=4.49, sum=4.49 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 3.925460591943641,
          "description": "min=1.962, mean=3.925, max=5.875, sum=19.627 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 20.790176352238564,
          "description": "min=13.711, mean=20.79, max=30.888, sum=145.531 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 12.338884568691254,
          "description": "min=12.339, mean=12.339, max=12.339, sum=12.339 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 3.9735240905509466,
          "description": "min=1.489, mean=3.974, max=6.264, sum=19.868 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 4.983887912264875,
          "description": "min=4.984, mean=4.984, max=4.984, sum=4.984 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 3.6028029962680237,
          "description": "min=2.057, mean=3.603, max=8.087, sum=18.014 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        }
      ],
      [
        {
          "value": "Mistral Instruct v0.3 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6787545787545788,
          "markdown": false
        },
        {
          "value": 0.8132137520212522,
          "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.5634698050022126,
          "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.5347676448822022,
          "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.25593132400512697,
          "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.37230395750413864,
          "description": "min=0.221, mean=0.372, max=0.487, sum=1.862 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 2.656151831465352,
          "description": "min=2.027, mean=2.656, max=3.039, sum=18.593 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 3.949965229511261,
          "description": "min=3.95, mean=3.95, max=3.95, sum=3.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 0.4887186054518059,
          "description": "min=0.316, mean=0.489, max=0.855, sum=2.444 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.4182186216767692,
          "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.7750062139801958,
          "description": "min=0.582, mean=0.775, max=0.872, sum=3.875 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        }
      ],
      [
        {
          "value": "Mistral v0.1 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.837012987012987,
          "markdown": false
        },
        {
          "value": 0.7051956902087574,
          "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.49417281556129455,
          "description": "min=0.494, mean=0.494, max=0.494, sum=0.494 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.46181689071655274,
          "description": "min=0.462, mean=0.462, max=0.462, sum=0.462 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.32474704647064206,
          "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.2914179778851961,
          "description": "min=0.272, mean=0.291, max=0.304, sum=1.457 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.159214100149656,
          "description": "min=0.992, mean=1.159, max=1.576, sum=8.114 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.6323128745555877,
          "description": "min=1.632, mean=1.632, max=1.632, sum=1.632 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.35307050709631943,
          "description": "min=0.287, mean=0.353, max=0.577, sum=1.765 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.3478535307093596,
          "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.5605853292576617,
          "description": "min=0.52, mean=0.561, max=0.701, sum=2.803 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        }
      ],
      [
        {
          "value": "Mixtral (8x22B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5583416583416583,
          "markdown": false
        },
        {
          "value": 1.477503587158633,
          "description": "min=1.478, mean=1.478, max=1.478, sum=1.478 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.003950766324997,
          "description": "min=1.004, mean=1.004, max=1.004, sum=1.004 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.44196626234054565,
          "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.33846320056915286,
          "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.344487278235586,
          "description": "min=0.313, mean=0.344, max=0.359, sum=1.722 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 2.5093491334109825,
          "description": "min=2.009, mean=2.509, max=3.121, sum=17.565 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 3.5390553929805755,
          "description": "min=3.539, mean=3.539, max=3.539, sum=3.539 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.8213642223004287,
          "description": "min=0.372, mean=0.821, max=1.973, sum=4.107 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.46328771849038825,
          "description": "min=0.463, mean=0.463, max=0.463, sum=0.463 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.9626315307056144,
          "description": "min=0.928, mean=0.963, max=0.982, sum=4.813 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        }
      ],
      [
        {
          "value": "Mixtral (8x7B 32K seqlen)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.701964701964702,
          "markdown": false
        },
        {
          "value": 0.649569604766201,
          "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.507013471364975,
          "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.5133386459350586,
          "description": "min=0.513, mean=0.513, max=0.513, sum=0.513 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.3542211503982544,
          "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.3604579553102192,
          "description": "min=0.355, mean=0.36, max=0.366, sum=1.802 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.527861329055259,
          "description": "min=1.128, mean=1.528, max=2.033, sum=10.695 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 3.2728567245006563,
          "description": "min=3.273, mean=3.273, max=3.273, sum=3.273 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.40995627823211056,
          "description": "min=0.369, mean=0.41, max=0.512, sum=2.05 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.35297762423338996,
          "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.2021687407719377,
          "description": "min=1.115, mean=1.202, max=1.294, sum=6.011 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        }
      ],
      [
        {
          "value": "OLMo (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6752247752247752,
          "markdown": false
        },
        {
          "value": 1.0318688553823552,
          "description": "min=1.032, mean=1.032, max=1.032, sum=1.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.9419968054294586,
          "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.3968301827907562,
          "description": "min=0.397, mean=0.397, max=0.397, sum=0.397 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.2902843647003174,
          "description": "min=0.29, mean=0.29, max=0.29, sum=0.29 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.325820258140564,
          "description": "min=0.309, mean=0.326, max=0.346, sum=1.629 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 2.2571195842818583,
          "description": "min=1.79, mean=2.257, max=2.808, sum=15.8 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 2.4104921889305113,
          "description": "min=2.41, mean=2.41, max=2.41, sum=2.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.5016753114389487,
          "description": "min=0.368, mean=0.502, max=0.929, sum=2.508 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.47797848879698496,
          "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.7709201743273374,
          "description": "min=0.661, mean=0.771, max=0.925, sum=3.855 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        }
      ],
      [
        {
          "value": "Phi-2",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.9307359307359307,
          "markdown": false
        },
        {
          "value": 0.49325697791408485,
          "description": "min=0.493, mean=0.493, max=0.493, sum=0.493 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.46984758591651915,
          "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.29179329943656923,
          "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.2615062308311462,
          "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.28525047320650343,
          "description": "min=0.27, mean=0.285, max=0.295, sum=1.426 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.1288332585709453,
          "description": "min=0.923, mean=1.129, max=1.577, sum=7.902 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.1468114259243012,
          "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.3034723702962031,
          "description": "min=0.268, mean=0.303, max=0.381, sum=1.517 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.27509861532783886,
          "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.47001117224047206,
          "description": "min=0.427, mean=0.47, max=0.534, sum=2.35 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 Chat (110B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6889443889443889,
          "markdown": false
        },
        {
          "value": 0.9843533623386437,
          "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.6468759918212891,
          "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.46513359355926515,
          "description": "min=0.465, mean=0.465, max=0.465, sum=0.465 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.24445231294631958,
          "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.2482092388136345,
          "description": "min=0.229, mean=0.248, max=0.277, sum=1.241 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 3.9885726889236994,
          "description": "min=2.984, mean=3.989, max=5.0, sum=27.92 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 4.537143226146698,
          "description": "min=4.537, mean=4.537, max=4.537, sum=4.537 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 0.4986402694478536,
          "description": "min=0.271, mean=0.499, max=1.328, sum=2.493 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.2881786700034473,
          "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.882270189100544,
          "description": "min=0.839, mean=0.882, max=0.896, sum=4.411 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (14B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7201798201798202,
          "markdown": false
        },
        {
          "value": 0.986717187183004,
          "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.6790921592712402,
          "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.3734231026172638,
          "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.2849515151977539,
          "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.30986739750075765,
          "description": "min=0.285, mean=0.31, max=0.335, sum=1.549 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 4.931704092498438,
          "description": "min=4.789, mean=4.932, max=5.055, sum=34.522 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.965628466129303,
          "description": "min=1.966, mean=1.966, max=1.966, sum=1.966 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.5443530451858324,
          "description": "min=0.332, mean=0.544, max=1.352, sum=2.722 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.3256318408025662,
          "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.606455911532908,
          "description": "min=0.59, mean=0.606, max=0.617, sum=3.032 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (32B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.49938394938394937,
          "markdown": false
        },
        {
          "value": 1.847580643774758,
          "description": "min=1.848, mean=1.848, max=1.848, sum=1.848 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.1394575798511506,
          "description": "min=1.139, mean=1.139, max=1.139, sum=1.139 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.457463458776474,
          "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.3515647969245911,
          "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.34482146733267266,
          "description": "min=0.337, mean=0.345, max=0.367, sum=1.724 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 9.436887120006455,
          "description": "min=8.668, mean=9.437, max=10.496, sum=66.058 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 3.405816124200821,
          "description": "min=3.406, mean=3.406, max=3.406, sum=3.406 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.7894946821991368,
          "description": "min=0.371, mean=0.789, max=2.33, sum=3.947 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.4515474046437925,
          "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.9200148107330449,
          "description": "min=0.902, mean=0.92, max=0.952, sum=4.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4068764568764569,
          "markdown": false
        },
        {
          "value": 2.4371175302586083,
          "description": "min=2.437, mean=2.437, max=2.437, sum=2.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.4208379020690918,
          "description": "min=1.421, mean=1.421, max=1.421, sum=1.421 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.5770996954441071,
          "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.3381467695236206,
          "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.3638015921659637,
          "description": "min=0.338, mean=0.364, max=0.396, sum=1.819 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 11.812623854443027,
          "description": "min=10.776, mean=11.813, max=12.91, sum=82.688 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 4.5866835827827455,
          "description": "min=4.587, mean=4.587, max=4.587, sum=4.587 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.8783966223148776,
          "description": "min=0.426, mean=0.878, max=1.58, sum=4.392 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.5430597031329782,
          "description": "min=0.543, mean=0.543, max=0.543, sum=0.543 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.1866255830765444,
          "description": "min=1.148, mean=1.187, max=1.205, sum=5.933 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.8345321345321345,
          "markdown": false
        },
        {
          "value": 0.8547548650016248,
          "description": "min=0.855, mean=0.855, max=0.855, sum=0.855 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.4786673946380615,
          "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.354404949426651,
          "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.2806105532646179,
          "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.28946571837810053,
          "description": "min=0.281, mean=0.289, max=0.298, sum=1.447 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 2.9328109453469335,
          "description": "min=2.593, mean=2.933, max=3.209, sum=20.53 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.380831289768219,
          "description": "min=1.381, mean=1.381, max=1.381, sum=1.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.4400657887452306,
          "description": "min=0.298, mean=0.44, max=0.946, sum=2.2 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.2983713296962306,
          "description": "min=0.298, mean=0.298, max=0.298, sum=0.298 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.4841760334465496,
          "description": "min=0.461, mean=0.484, max=0.517, sum=2.421 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        }
      ],
      [
        {
          "value": "Qwen2 Instruct (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6143523143523143,
          "markdown": false
        },
        {
          "value": 1.1896146727279877,
          "description": "min=1.19, mean=1.19, max=1.19, sum=1.19 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.8683992192745209,
          "description": "min=0.868, mean=0.868, max=0.868, sum=0.868 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.35628414297103883,
          "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.21781798839569091,
          "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.2769099538284435,
          "description": "min=0.195, mean=0.277, max=0.395, sum=1.385 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 4.461141077844028,
          "description": "min=3.599, mean=4.461, max=5.828, sum=31.228 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 6.592170278310776,
          "description": "min=6.592, mean=6.592, max=6.592, sum=6.592 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 0.5210018908984072,
          "description": "min=0.233, mean=0.521, max=1.575, sum=2.605 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.5349795590812122,
          "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.8269615642193179,
          "description": "min=0.802, mean=0.827, max=0.86, sum=4.135 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        }
      ],
      [
        {
          "value": "Qwen2.5 Instruct Turbo (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6085081585081585,
          "markdown": false
        },
        {
          "value": 0.8528219290182624,
          "description": "min=0.853, mean=0.853, max=0.853, sum=0.853 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.9738211624622345,
          "description": "min=0.974, mean=0.974, max=0.974, sum=0.974 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.5063141629695892,
          "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.3723496675491333,
          "description": "min=0.372, mean=0.372, max=0.372, sum=0.372 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.5848997679509614,
          "description": "min=0.438, mean=0.585, max=0.815, sum=2.924 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 6.366941373965945,
          "description": "min=3.874, mean=6.367, max=11.192, sum=44.569 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 2.5583292784690856,
          "description": "min=2.558, mean=2.558, max=2.558, sum=2.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.44489043568091446,
          "description": "min=0.306, mean=0.445, max=0.944, sum=2.224 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.33223102912751157,
          "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.6702916101891663,
          "description": "min=0.635, mean=0.67, max=0.752, sum=3.351 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Qwen2.5 Instruct Turbo (7B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.9126373626373626,
          "markdown": false
        },
        {
          "value": 0.5156192410160119,
          "description": "min=0.516, mean=0.516, max=0.516, sum=0.516 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.30121764993667605,
          "description": "min=0.301, mean=0.301, max=0.301, sum=0.301 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.21686342740058898,
          "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.1863201789855957,
          "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.35013260537699653,
          "description": "min=0.285, mean=0.35, max=0.431, sum=1.751 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1.8253796190803115,
          "description": "min=1.449, mean=1.825, max=2.345, sum=12.778 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1.7000067098140716,
          "description": "min=1.7, mean=1.7, max=1.7, sum=1.7 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.2609495958632719,
          "description": "min=0.183, mean=0.261, max=0.489, sum=1.305 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.20058301760709546,
          "description": "min=0.201, mean=0.201, max=0.201, sum=0.201 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.3759268445955365,
          "description": "min=0.346, mean=0.376, max=0.414, sum=1.88 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Arctic Instruct",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.793073593073593,
          "markdown": false
        },
        {
          "value": 0.6239793220036466,
          "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.6355201268196106,
          "description": "min=0.636, mean=0.636, max=0.636, sum=0.636 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.4687326259613037,
          "description": "min=0.469, mean=0.469, max=0.469, sum=0.469 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.2840936713218689,
          "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.30325288054817606,
          "description": "min=0.293, mean=0.303, max=0.317, sum=1.516 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1.723981539653867,
          "description": "min=1.482, mean=1.724, max=1.995, sum=12.068 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 2.9610197002887726,
          "description": "min=2.961, mean=2.961, max=2.961, sum=2.961 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 0.34576316386866485,
          "description": "min=0.292, mean=0.346, max=0.462, sum=1.729 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.31300480038697864,
          "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.681007040066764,
          "description": "min=0.65, mean=0.681, max=0.702, sum=3.405 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        }
      ],
      [
        {
          "value": "Yi (34B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2837662337662338,
          "markdown": false
        },
        {
          "value": 2.368284817816506,
          "description": "min=2.368, mean=2.368, max=2.368, sum=2.368 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.8157690076828004,
          "description": "min=1.816, mean=1.816, max=1.816, sum=1.816 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.4578230485916137,
          "description": "min=1.458, mean=1.458, max=1.458, sum=1.458 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.8229070715904235,
          "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.6972272023485417,
          "description": "min=0.511, mean=0.697, max=0.925, sum=3.486 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 3.809198633421,
          "description": "min=2.651, mean=3.809, max=4.649, sum=26.664 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 4.886563032150269,
          "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.8004560962069804,
          "description": "min=0.465, mean=0.8, max=1.207, sum=4.002 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.064007310696672,
          "description": "min=1.064, mean=1.064, max=1.064, sum=1.064 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.4042062711970469,
          "description": "min=1.071, mean=1.404, max=2.506, sum=7.021 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        }
      ],
      [
        {
          "value": "Yi (6B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6894605394605394,
          "markdown": false
        },
        {
          "value": 1.4038719868995775,
          "description": "min=1.404, mean=1.404, max=1.404, sum=1.404 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.9108293209075927,
          "description": "min=0.911, mean=0.911, max=0.911, sum=0.911 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.4127621691226959,
          "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.3535394024848938,
          "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.3391338364283244,
          "description": "min=0.323, mean=0.339, max=0.368, sum=1.696 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.8371926514375443,
          "description": "min=1.167, mean=1.837, max=2.263, sum=12.86 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.8781680135726928,
          "description": "min=1.878, mean=1.878, max=1.878, sum=1.878 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.5528668178286933,
          "description": "min=0.379, mean=0.553, max=1.149, sum=2.764 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.4053303655051806,
          "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.6257070175426044,
          "description": "min=0.602, mean=0.626, max=0.666, sum=3.129 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        }
      ],
      [
        {
          "value": "Jurassic-2 Grande (17B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4234099234099234,
          "markdown": false
        },
        {
          "value": 1.1790085772393455,
          "description": "min=1.179, mean=1.179, max=1.179, sum=1.179 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 1.4618877012729645,
          "description": "min=1.462, mean=1.462, max=1.462, sum=1.462 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.630548656463623,
          "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.519375147819519,
          "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.6205235414421348,
          "description": "min=0.549, mean=0.621, max=0.755, sum=3.103 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 4.862255273244342,
          "description": "min=2.609, mean=4.862, max=6.298, sum=34.036 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.417125414848328,
          "description": "min=5.417, mean=5.417, max=5.417, sum=5.417 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.7122931517101486,
          "description": "min=0.409, mean=0.712, max=1.079, sum=3.561 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.9142626611660299,
          "description": "min=0.914, mean=0.914, max=0.914, sum=0.914 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.7586197336965614,
          "description": "min=0.723, mean=0.759, max=0.81, sum=3.793 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        }
      ],
      [
        {
          "value": "Jurassic-2 Jumbo (178B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.20549450549450549,
          "markdown": false
        },
        {
          "value": 1.8203622415032186,
          "description": "min=1.82, mean=1.82, max=1.82, sum=1.82 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.4479399914741515,
          "description": "min=1.448, mean=1.448, max=1.448, sum=1.448 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.3321147253513335,
          "description": "min=5.332, mean=5.332, max=5.332, sum=5.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.9981746392250062,
          "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.8103257050430566,
          "description": "min=0.693, mean=0.81, max=0.92, sum=4.052 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 9.135811412885502,
          "description": "min=4.497, mean=9.136, max=13.531, sum=63.951 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.176425676584244,
          "description": "min=5.176, mean=5.176, max=5.176, sum=5.176 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.2737073742826783,
          "description": "min=0.639, mean=1.274, max=2.827, sum=6.369 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.5350148075854566,
          "description": "min=1.535, mean=1.535, max=1.535, sum=1.535 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.4411698855373092,
          "description": "min=1.236, mean=1.441, max=1.665, sum=7.206 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        }
      ],
      [
        {
          "value": "Jamba Instruct",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6800699300699301,
          "markdown": false
        },
        {
          "value": 0.9470622405199938,
          "description": "min=0.947, mean=0.947, max=0.947, sum=0.947 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.8087365460395813,
          "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.5348668487071991,
          "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.30006033515930175,
          "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.2654710942151254,
          "description": "min=0.253, mean=0.265, max=0.275, sum=1.327 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 3.24175411841349,
          "description": "min=1.917, mean=3.242, max=5.09, sum=22.692 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 3.8455032846927644,
          "description": "min=3.846, mean=3.846, max=3.846, sum=3.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 0.6408480782672099,
          "description": "min=0.351, mean=0.641, max=1.337, sum=3.204 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.31133864366747516,
          "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.6354023076110767,
          "description": "min=0.586, mean=0.635, max=0.686, sum=2.542 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        }
      ],
      [
        {
          "value": "Jamba 1.5 Mini",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.47021312021312023,
          "markdown": false
        },
        {
          "value": 0.9981950746455662,
          "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.9243871104717255,
          "description": "min=0.924, mean=0.924, max=0.924, sum=0.924 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.8436705965995789,
          "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.7863723936080933,
          "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.8097888966024968,
          "description": "min=0.783, mean=0.81, max=0.83, sum=4.049 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1.63604986000122,
          "description": "min=1.462, mean=1.636, max=2.034, sum=11.452 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1.8916997435092926,
          "description": "min=1.892, mean=1.892, max=1.892, sum=1.892 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 0.8644844750252041,
          "description": "min=0.805, mean=0.864, max=1.071, sum=4.322 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.8172814860258615,
          "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.9776749755042665,
          "description": "min=0.965, mean=0.978, max=0.99, sum=4.888 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        }
      ],
      [
        {
          "value": "Jamba 1.5 Large",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2787712287712288,
          "markdown": false
        },
        {
          "value": 1.9694313982842673,
          "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1.678127991437912,
          "description": "min=1.678, mean=1.678, max=1.678, sum=1.678 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1.2717866213321687,
          "description": "min=1.272, mean=1.272, max=1.272, sum=1.272 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.9100792293548584,
          "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.973254363085094,
          "description": "min=0.933, mean=0.973, max=1.0, sum=4.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 3.1790229759699775,
          "description": "min=2.366, mean=3.179, max=4.736, sum=22.253 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 3.942030364751816,
          "description": "min=3.942, mean=3.942, max=3.942, sum=3.942 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 1.2577736545740559,
          "description": "min=0.933, mean=1.258, max=2.367, sum=6.289 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.9989562840395372,
          "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1.3859240114613673,
          "description": "min=1.317, mean=1.386, max=1.471, sum=6.93 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        }
      ],
      [
        {
          "value": "Luminous Base (13B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.3115884115884116,
          "markdown": false
        },
        {
          "value": 1.05044368958809,
          "description": "min=1.05, mean=1.05, max=1.05, sum=1.05 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1.328731627702713,
          "description": "min=1.329, mean=1.329, max=1.329, sum=1.329 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.8020290625095368,
          "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.6669360423088073,
          "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.6324507230122884,
          "description": "min=0.619, mean=0.632, max=0.648, sum=3.162 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 9.203530075671766,
          "description": "min=5.282, mean=9.204, max=20.088, sum=64.425 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 16.42652773284912,
          "description": "min=16.427, mean=16.427, max=16.427, sum=16.427 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.7533007583490331,
          "description": "min=0.636, mean=0.753, max=1.073, sum=3.767 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.7258754989972882,
          "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 4.692985351748752,
          "description": "min=4.671, mean=4.693, max=4.731, sum=23.465 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        }
      ],
      [
        {
          "value": "Luminous Extended (30B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.24105894105894107,
          "markdown": false
        },
        {
          "value": 1.4667296523779212,
          "description": "min=1.467, mean=1.467, max=1.467, sum=1.467 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1.777582576751709,
          "description": "min=1.778, mean=1.778, max=1.778, sum=1.778 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.9799906523227692,
          "description": "min=0.98, mean=0.98, max=0.98, sum=0.98 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.6750410146713257,
          "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.7183412402554562,
          "description": "min=0.69, mean=0.718, max=0.754, sum=3.592 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 9.364456500699777,
          "description": "min=5.96, mean=9.364, max=12.108, sum=65.551 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 22.685439155817033,
          "description": "min=22.685, mean=22.685, max=22.685, sum=22.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.8581969152200717,
          "description": "min=0.7, mean=0.858, max=1.261, sum=4.291 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.8947408758622277,
          "description": "min=0.895, mean=0.895, max=0.895, sum=0.895 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 5.33597646673717,
          "description": "min=5.231, mean=5.336, max=5.406, sum=26.68 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        }
      ],
      [
        {
          "value": "Luminous Supreme (70B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.13847818847818846,
          "markdown": false
        },
        {
          "value": 2.9511526873413945,
          "description": "min=2.951, mean=2.951, max=2.951, sum=2.951 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 2.656584274530411,
          "description": "min=2.657, mean=2.657, max=2.657, sum=2.657 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.2722365505695343,
          "description": "min=1.272, mean=1.272, max=1.272, sum=1.272 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.778845920085907,
          "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.9073754794472141,
          "description": "min=0.825, mean=0.907, max=1.009, sum=4.537 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 16.873623512856078,
          "description": "min=13.143, mean=16.874, max=20.77, sum=118.115 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 48.241569149971006,
          "description": "min=48.242, mean=48.242, max=48.242, sum=48.242 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.1561943690304337,
          "description": "min=0.84, mean=1.156, max=2.035, sum=5.781 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.325726029887114,
          "description": "min=1.326, mean=1.326, max=1.326, sum=1.326 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 11.052006985892152,
          "description": "min=10.924, mean=11.052, max=11.265, sum=55.26 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        }
      ],
      [
        {
          "value": "Claude v1.3",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.09766899766899767,
          "markdown": false
        },
        {
          "value": 6.113923052666893,
          "description": "min=6.114, mean=6.114, max=6.114, sum=6.114 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 3.5226667501174913,
          "description": "min=3.523, mean=3.523, max=3.523, sum=3.523 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 2.0589215233325957,
          "description": "min=2.059, mean=2.059, max=2.059, sum=2.059 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 3.375496371269226,
          "description": "min=3.375, mean=3.375, max=3.375, sum=3.375 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1.4820951028288456,
          "description": "min=1.228, mean=1.482, max=1.741, sum=7.41 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 6.10879439056091,
          "description": "min=3.85, mean=6.109, max=8.225, sum=42.762 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 6.653211696863174,
          "description": "min=6.653, mean=6.653, max=6.653, sum=6.653 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 3.536136101917547,
          "description": "min=1.081, mean=3.536, max=8.614, sum=17.681 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 3.3901417141643244,
          "description": "min=3.39, mean=3.39, max=3.39, sum=3.39 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 2.232213549153336,
          "description": "min=1.391, mean=2.232, max=3.755, sum=11.161 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        }
      ],
      [
        {
          "value": "Claude Instant 1.2",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5229437229437229,
          "markdown": false
        },
        {
          "value": 1.490500447447871,
          "description": "min=1.491, mean=1.491, max=1.491, sum=1.491 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.9746438981543135,
          "description": "min=0.975, mean=0.975, max=0.975, sum=0.975 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.6736472499370575,
          "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.596853446483612,
          "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.613885824571576,
          "description": "min=0.59, mean=0.614, max=0.636, sum=3.069 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.4029501960147133,
          "description": "min=1.247, mean=1.403, max=1.528, sum=9.821 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.474282945394516,
          "description": "min=1.474, mean=1.474, max=1.474, sum=1.474 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.9110085331512334,
          "description": "min=0.629, mean=0.911, max=1.974, sum=4.555 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.7633721221749399,
          "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.7717107724915095,
          "description": "min=0.726, mean=0.772, max=0.838, sum=3.859 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        }
      ],
      [
        {
          "value": "Claude 2.0",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.15685980685980686,
          "markdown": false
        },
        {
          "value": 4.8114360809326175,
          "description": "min=4.811, mean=4.811, max=4.811, sum=4.811 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 2.9841483016268606,
          "description": "min=2.984, mean=2.984, max=2.984, sum=2.984 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.1486653406620027,
          "description": "min=1.149, mean=1.149, max=1.149, sum=1.149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.5584912838935852,
          "description": "min=1.558, mean=1.558, max=1.558, sum=1.558 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.7282055348597072,
          "description": "min=1.609, mean=1.728, max=1.936, sum=8.641 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 6.211058685420826,
          "description": "min=5.057, mean=6.211, max=7.33, sum=43.477 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 4.857238686800003,
          "description": "min=4.857, mean=4.857, max=4.857, sum=4.857 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 2.782158235233088,
          "description": "min=1.703, mean=2.782, max=6.2, sum=13.911 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 2.2539968865055213,
          "description": "min=2.254, mean=2.254, max=2.254, sum=2.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.9951115173159082,
          "description": "min=1.692, mean=1.995, max=2.443, sum=9.976 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        }
      ],
      [
        {
          "value": "Claude 2.1",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.08476523476523476,
          "markdown": false
        },
        {
          "value": 5.376147254755799,
          "description": "min=5.376, mean=5.376, max=5.376, sum=5.376 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 4.16052336707216,
          "description": "min=4.161, mean=4.161, max=4.161, sum=4.161 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.753281570672989,
          "description": "min=1.753, mean=1.753, max=1.753, sum=1.753 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.8090401072502136,
          "description": "min=1.809, mean=1.809, max=1.809, sum=1.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 2.370939975420634,
          "description": "min=2.043, mean=2.371, max=2.615, sum=11.855 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 9.671810739168015,
          "description": "min=9.158, mean=9.672, max=10.737, sum=67.703 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 7.7061755385398865,
          "description": "min=7.706, mean=7.706, max=7.706, sum=7.706 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 3.2225898594048035,
          "description": "min=2.23, mean=3.223, max=6.58, sum=16.113 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 2.482170646754695,
          "description": "min=2.482, mean=2.482, max=2.482, sum=2.482 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 2.7559348208894425,
          "description": "min=2.478, mean=2.756, max=3.455, sum=13.78 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Haiku (20240307)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5678987678987679,
          "markdown": false
        },
        {
          "value": 1.1334171402622277,
          "description": "min=1.133, mean=1.133, max=1.133, sum=1.133 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.9411524205207825,
          "description": "min=0.941, mean=0.941, max=0.941, sum=0.941 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.8646892714500427,
          "description": "min=0.865, mean=0.865, max=0.865, sum=0.865 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.6164444308280945,
          "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.6970766685050831,
          "description": "min=0.686, mean=0.697, max=0.721, sum=3.485 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.8950275982044664,
          "description": "min=0.672, mean=0.895, max=1.288, sum=6.265 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1.2278449382781982,
          "description": "min=1.228, mean=1.228, max=1.228, sum=1.228 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.7186767522236834,
          "description": "min=0.455, mean=0.719, max=0.988, sum=3.593 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.6529203475588121,
          "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.7111122513056886,
          "description": "min=0.627, mean=0.711, max=0.891, sum=3.556 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Sonnet (20240229)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.29172494172494173,
          "markdown": false
        },
        {
          "value": 2.2392607588163562,
          "description": "min=2.239, mean=2.239, max=2.239, sum=2.239 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.828468058347702,
          "description": "min=1.828, mean=1.828, max=1.828, sum=1.828 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.2262272393703462,
          "description": "min=1.226, mean=1.226, max=1.226, sum=1.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.031575677871704,
          "description": "min=1.032, mean=1.032, max=1.032, sum=1.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.2781797420267473,
          "description": "min=1.228, mean=1.278, max=1.341, sum=6.391 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 2.3301560711519222,
          "description": "min=2.092, mean=2.33, max=2.633, sum=16.311 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 3.2127642614841463,
          "description": "min=3.213, mean=3.213, max=3.213, sum=3.213 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.3159105889028733,
          "description": "min=0.683, mean=1.316, max=2.689, sum=6.58 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.1428523476033752,
          "description": "min=1.143, mean=1.143, max=1.143, sum=1.143 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.1393479201068188,
          "description": "min=1.066, mean=1.139, max=1.228, sum=5.697 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Opus (20240229)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.09375624375624375,
          "markdown": false
        },
        {
          "value": 3.9963467248728577,
          "description": "min=3.996, mean=3.996, max=3.996, sum=3.996 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 4.273005393266678,
          "description": "min=4.273, mean=4.273, max=4.273, sum=4.273 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1.6471402559280395,
          "description": "min=1.647, mean=1.647, max=1.647, sum=1.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 2.167769320487976,
          "description": "min=2.168, mean=2.168, max=2.168, sum=2.168 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 4.189554240862528,
          "description": "min=4.003, mean=4.19, max=4.373, sum=20.948 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 7.541890628266922,
          "description": "min=6.095, mean=7.542, max=9.041, sum=52.793 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 7.469249876976013,
          "description": "min=7.469, mean=7.469, max=7.469, sum=7.469 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 2.570133829482505,
          "description": "min=1.391, mean=2.57, max=4.856, sum=12.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 2.6499544673601156,
          "description": "min=2.65, mean=2.65, max=2.65, sum=2.65 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 2.4465377724275283,
          "description": "min=2.279, mean=2.447, max=2.661, sum=12.233 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.5 Sonnet (20240620)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2879120879120879,
          "markdown": false
        },
        {
          "value": 3.5003784911733278,
          "description": "min=3.5, mean=3.5, max=3.5, sum=3.5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.8338699455261231,
          "description": "min=1.834, mean=1.834, max=1.834, sum=1.834 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.738832370519638,
          "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.7740971641540527,
          "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.8242833791364703,
          "description": "min=0.765, mean=0.824, max=0.973, sum=4.121 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 3.0116338881061275,
          "description": "min=2.231, mean=3.012, max=3.921, sum=21.081 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 3.162740940093994,
          "description": "min=3.163, mean=3.163, max=3.163, sum=3.163 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.473749651523724,
          "description": "min=0.66, mean=1.474, max=4.297, sum=7.369 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.1990809397953406,
          "description": "min=1.199, mean=1.199, max=1.199, sum=1.199 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.9232725335746241,
          "description": "min=1.838, mean=1.923, max=2.007, sum=9.616 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        }
      ],
      [
        {
          "value": "Command",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.23258408258408259,
          "markdown": false
        },
        {
          "value": 1.783306110408944,
          "description": "min=1.783, mean=1.783, max=1.783, sum=1.783 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 1.8040301027297974,
          "description": "min=1.804, mean=1.804, max=1.804, sum=1.804 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 0.9856750283241272,
          "description": "min=0.986, mean=0.986, max=0.986, sum=0.986 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 1.0440752515792846,
          "description": "min=1.044, mean=1.044, max=1.044, sum=1.044 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 1.0797608851633573,
          "description": "min=0.821, mean=1.08, max=1.384, sum=5.399 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 5.762416239357385,
          "description": "min=4.562, mean=5.762, max=6.509, sum=40.337 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 4.127378141641617,
          "description": "min=4.127, mean=4.127, max=4.127, sum=4.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 1.1646721122881132,
          "description": "min=0.856, mean=1.165, max=1.842, sum=5.823 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 1.2344102347584416,
          "description": "min=1.234, mean=1.234, max=1.234, sum=1.234 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 2.8937741082134893,
          "description": "min=2.376, mean=2.894, max=3.133, sum=14.469 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        }
      ],
      [
        {
          "value": "Command Light",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.49335664335664337,
          "markdown": false
        },
        {
          "value": 0.8961316760157195,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 1.0799305574893951,
          "description": "min=1.08, mean=1.08, max=1.08, sum=1.08 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 0.6957695767879486,
          "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 0.7049956932067871,
          "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 0.7494988910942747,
          "description": "min=0.405, mean=0.749, max=1.412, sum=3.747 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 2.374249639604042,
          "description": "min=1.821, mean=2.374, max=2.948, sum=16.62 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 1.7514978868961335,
          "description": "min=1.751, mean=1.751, max=1.751, sum=1.751 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 0.7831334660572837,
          "description": "min=0.423, mean=0.783, max=1.232, sum=3.916 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 0.895831539901066,
          "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 0.7965989762712353,
          "description": "min=0.712, mean=0.797, max=0.934, sum=3.983 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        }
      ],
      [
        {
          "value": "Command R",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.991008991008991,
          "style": {
            "font-weight": "bold"
          },
          "markdown": false
        },
        {
          "value": 0.3886059089445732,
          "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.2875482747554779,
          "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 0.16523362946510314,
          "description": "min=0.165, mean=0.165, max=0.165, sum=0.165 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 0.14960159301757814,
          "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.17335561692923832,
          "description": "min=0.162, mean=0.173, max=0.185, sum=0.867 (5)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.8207379439676702,
          "description": "min=0.659, mean=0.821, max=1.104, sum=5.745 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 1.0398468203544617,
          "description": "min=1.04, mean=1.04, max=1.04, sum=1.04 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 0.23478191454837286,
          "description": "min=0.151, mean=0.235, max=0.5, sum=1.174 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 0.19128861531585634,
          "description": "min=0.191, mean=0.191, max=0.191, sum=0.191 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.3429552388299011,
          "description": "min=0.308, mean=0.343, max=0.455, sum=1.715 (5)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        }
      ],
      [
        {
          "value": "Command R Plus",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7249916749916749,
          "markdown": false
        },
        {
          "value": 0.6590185803426823,
          "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.48011646389961243,
          "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.21743906450271605,
          "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.5261325912475586,
          "description": "min=0.526, mean=0.526, max=0.526, sum=0.526 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.3594088048349347,
          "description": "min=0.26, mean=0.359, max=0.481, sum=1.797 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1.7917883168992628,
          "description": "min=1.358, mean=1.792, max=2.877, sum=12.543 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 3.5923334171772003,
          "description": "min=3.592, mean=3.592, max=3.592, sum=3.592 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 0.3508069759610481,
          "description": "min=0.193, mean=0.351, max=0.927, sum=1.754 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.6308214294744533,
          "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.6441886008863676,
          "description": "min=0.59, mean=0.644, max=0.742, sum=3.221 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.0 Pro (002)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6742091242091242,
          "markdown": false
        },
        {
          "value": 0.6791302858934104,
          "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.6086829407215119,
          "description": "min=0.609, mean=0.609, max=0.609, sum=0.609 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.5965619602203369,
          "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.4301223816871643,
          "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.4066482855060644,
          "description": "min=0.397, mean=0.407, max=0.417, sum=2.033 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1.5848151401531698,
          "description": "min=1.402, mean=1.585, max=2.083, sum=11.094 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1.513066102743149,
          "description": "min=1.513, mean=1.513, max=1.513, sum=1.513 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 0.6085789782066453,
          "description": "min=0.447, mean=0.609, max=1.08, sum=3.043 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.4310008814610333,
          "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.8027491282517494,
          "description": "min=0.705, mean=0.803, max=0.924, sum=4.014 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5096903096903097,
          "markdown": false
        },
        {
          "value": 0.8351484166930544,
          "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.7170397922992706,
          "description": "min=0.717, mean=0.717, max=0.717, sum=0.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.6341883151531219,
          "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.6239193634986877,
          "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.6902154895882857,
          "description": "min=0.65, mean=0.69, max=0.763, sum=3.451 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 2.701360058859101,
          "description": "min=2.006, mean=2.701, max=3.274, sum=18.91 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3.205789808034897,
          "description": "min=3.206, mean=3.206, max=3.206, sum=3.206 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 0.7752882438000996,
          "description": "min=0.577, mean=0.775, max=1.078, sum=3.876 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.5296737767785669,
          "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.1399874632845124,
          "description": "min=1.029, mean=1.14, max=1.4, sum=5.7 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Flash (001)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.7151348651348651,
          "markdown": false
        },
        {
          "value": 0.6474363112991507,
          "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.49524100852012637,
          "description": "min=0.495, mean=0.495, max=0.495, sum=0.495 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.431587886095047,
          "description": "min=0.432, mean=0.432, max=0.432, sum=0.432 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.5038927392959595,
          "description": "min=0.504, mean=0.504, max=0.504, sum=0.504 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.5683523873948214,
          "description": "min=0.525, mean=0.568, max=0.62, sum=2.842 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1.592031592636459,
          "description": "min=1.303, mean=1.592, max=2.086, sum=11.144 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1.7575640678405762,
          "description": "min=1.758, mean=1.758, max=1.758, sum=1.758 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 0.6040551961526522,
          "description": "min=0.409, mean=0.604, max=0.842, sum=3.02 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.3993651843165971,
          "description": "min=0.399, mean=0.399, max=0.399, sum=0.399 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.6372637821067911,
          "description": "min=0.581, mean=0.637, max=0.75, sum=3.186 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        }
      ],
      [
        {
          "value": "PaLM-2 (Bison)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5011655011655012,
          "markdown": false
        },
        {
          "value": 1.030712524602111,
          "description": "min=1.031, mean=1.031, max=1.031, sum=1.031 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.987217092037201,
          "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.754590849161148,
          "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.7879144654273987,
          "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.1122005350882547,
          "description": "min=1.017, mean=1.112, max=1.352, sum=5.561 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.6140828338918989,
          "description": "min=1.161, mean=1.614, max=2.126, sum=11.299 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.4403084371089936,
          "description": "min=1.44, mean=1.44, max=1.44, sum=1.44 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.7366328867537384,
          "description": "min=0.53, mean=0.737, max=1.325, sum=3.683 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.7348999071784806,
          "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.8753595397700126,
          "description": "min=0.826, mean=0.875, max=0.952, sum=4.377 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        }
      ],
      [
        {
          "value": "PaLM-2 (Unicorn)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.19132534132534132,
          "markdown": false
        },
        {
          "value": 3.283053755424392,
          "description": "min=3.283, mean=3.283, max=3.283, sum=3.283 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 2.564493465423584,
          "description": "min=2.564, mean=2.564, max=2.564, sum=2.564 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.5603588831424713,
          "description": "min=1.56, mean=1.56, max=1.56, sum=1.56 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.9994440112113953,
          "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.2620431824148748,
          "description": "min=1.198, mean=1.262, max=1.332, sum=6.31 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 4.636334307701402,
          "description": "min=4.016, mean=4.636, max=5.654, sum=32.454 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.4373185629844665,
          "description": "min=5.437, mean=5.437, max=5.437, sum=5.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.4374773445647835,
          "description": "min=0.859, mean=1.437, max=3.198, sum=7.187 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.1783231205305096,
          "description": "min=1.178, mean=1.178, max=1.178, sum=1.178 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.801295139912888,
          "description": "min=1.706, mean=1.801, max=1.909, sum=9.006 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        }
      ],
      [
        {
          "value": "Yi Large (Preview)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.18981018981018982,
          "markdown": false
        },
        {
          "value": 2.6724000897206053,
          "description": "min=2.672, mean=2.672, max=2.672, sum=2.672 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 2.506305232524872,
          "description": "min=2.506, mean=2.506, max=2.506, sum=2.506 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1.0360134015083313,
          "description": "min=1.036, mean=1.036, max=1.036, sum=1.036 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.77673295545578,
          "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.7133434140138459,
          "description": "min=0.679, mean=0.713, max=0.752, sum=3.567 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 11.510960669458308,
          "description": "min=8.67, mean=11.511, max=13.559, sum=80.577 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 13.45040065407753,
          "description": "min=13.45, mean=13.45, max=13.45, sum=13.45 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 1.471592522464795,
          "description": "min=0.855, mean=1.472, max=3.502, sum=7.358 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.9931588552107157,
          "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 2.095412739007152,
          "description": "min=1.838, mean=2.095, max=2.409, sum=10.477 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        }
      ],
      [
        {
          "value": "Mistral Small (2402)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4148518148518149,
          "markdown": false
        },
        {
          "value": 0.947719474577568,
          "description": "min=0.948, mean=0.948, max=0.948, sum=0.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.384453837633133,
          "description": "min=1.384, mean=1.384, max=1.384, sum=1.384 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.4422871778011321,
          "description": "min=1.442, mean=1.442, max=1.442, sum=1.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.5299914984703064,
          "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.2616501861371492,
          "description": "min=1.022, mean=1.262, max=1.477, sum=6.308 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 2.216904607788028,
          "description": "min=1.895, mean=2.217, max=2.662, sum=15.518 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 2.9720949590206147,
          "description": "min=2.972, mean=2.972, max=2.972, sum=2.972 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.8738773620338431,
          "description": "min=0.609, mean=0.874, max=1.067, sum=4.369 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.4987720272413068,
          "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.1885517670659458,
          "description": "min=0.945, mean=1.189, max=1.429, sum=5.943 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        }
      ],
      [
        {
          "value": "Mistral Medium (2312)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.07064602064602064,
          "markdown": false
        },
        {
          "value": 3.898151301666045,
          "description": "min=3.898, mean=3.898, max=3.898, sum=3.898 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.342489146232605,
          "description": "min=5.342, mean=5.342, max=5.342, sum=5.342 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 6.588117929935455,
          "description": "min=6.588, mean=6.588, max=6.588, sum=6.588 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 2.1195812821388245,
          "description": "min=2.12, mean=2.12, max=2.12, sum=2.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 2.774717758923246,
          "description": "min=1.507, mean=2.775, max=3.62, sum=13.874 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 7.0860357509079535,
          "description": "min=6.1, mean=7.086, max=10.207, sum=49.602 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 9.718977437496186,
          "description": "min=9.719, mean=9.719, max=9.719, sum=9.719 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 3.248400288401771,
          "description": "min=2.695, mean=3.248, max=3.795, sum=16.242 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 2.813041030531138,
          "description": "min=2.813, mean=2.813, max=2.813, sum=2.813 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 4.9482336292575715,
          "description": "min=3.982, mean=4.948, max=6.067, sum=24.741 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        }
      ],
      [
        {
          "value": "Mistral Large (2402)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.2501831501831502,
          "markdown": false
        },
        {
          "value": 1.6750120075655655,
          "description": "min=1.675, mean=1.675, max=1.675, sum=1.675 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.665770656108856,
          "description": "min=1.666, mean=1.666, max=1.666, sum=1.666 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 2.1218616259098053,
          "description": "min=2.122, mean=2.122, max=2.122, sum=2.122 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.5687967395782471,
          "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.4514196366845515,
          "description": "min=1.226, mean=1.451, max=1.633, sum=7.257 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.128044104863146,
          "description": "min=3.885, mean=5.128, max=5.812, sum=35.896 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 7.095049407720566,
          "description": "min=7.095, mean=7.095, max=7.095, sum=7.095 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.6924799473534797,
          "description": "min=0.985, mean=1.692, max=2.787, sum=8.462 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.5950325303238856,
          "description": "min=0.595, mean=0.595, max=0.595, sum=0.595 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.969239294333439,
          "description": "min=1.69, mean=1.969, max=2.702, sum=9.846 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        }
      ],
      [
        {
          "value": "Mistral Large 2 (2407)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.44262404262404265,
          "markdown": false
        },
        {
          "value": 0.7276979574015443,
          "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.7573216142654419,
          "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.5273597676753998,
          "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.8910596170425416,
          "description": "min=0.891, mean=0.891, max=0.891, sum=0.891 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.7886472435834114,
          "description": "min=0.684, mean=0.789, max=0.933, sum=3.943 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.441067432619708,
          "description": "min=4.359, mean=5.441, max=6.464, sum=38.087 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.431343378543854,
          "description": "min=5.431, mean=5.431, max=5.431, sum=5.431 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 0.7974768901406878,
          "description": "min=0.485, mean=0.797, max=0.986, sum=3.987 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.4456319799480097,
          "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1.2686868536542282,
          "description": "min=1.075, mean=1.269, max=1.402, sum=6.343 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        }
      ],
      [
        {
          "value": "Mistral NeMo (2407)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5563103563103563,
          "markdown": false
        },
        {
          "value": 0.7111437549053783,
          "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.851971923828125,
          "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.8765462198257447,
          "description": "min=0.877, mean=0.877, max=0.877, sum=0.877 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.7987758111953736,
          "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.7815720957371226,
          "description": "min=0.635, mean=0.782, max=1.011, sum=3.908 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1.0132869822173503,
          "description": "min=0.866, mean=1.013, max=1.281, sum=7.093 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1.4254731934070588,
          "description": "min=1.425, mean=1.425, max=1.425, sum=1.425 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 0.7795765090728288,
          "description": "min=0.715, mean=0.78, max=0.868, sum=3.898 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.7488490715178533,
          "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.7821908106898373,
          "description": "min=0.752, mean=0.782, max=0.819, sum=3.911 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 (text-davinci-003)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.6131701631701632,
          "markdown": false
        },
        {
          "value": 1.812959625351597,
          "description": "min=1.813, mean=1.813, max=1.813, sum=1.813 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1.1872664585113526,
          "description": "min=1.187, mean=1.187, max=1.187, sum=1.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.9963206455707551,
          "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.20436767482757567,
          "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.2000334782098469,
          "description": "min=0.199, mean=0.2, max=0.203, sum=1.0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 4.333955165715466,
          "description": "min=3.871, mean=4.334, max=5.181, sum=30.338 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 5.199419307470322,
          "description": "min=5.199, mean=5.199, max=5.199, sum=5.199 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.2594051892596125,
          "description": "min=0.189, mean=0.259, max=0.474, sum=1.297 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.22811962975185388,
          "description": "min=0.228, mean=0.228, max=0.228, sum=0.228 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.800053899013968,
          "description": "min=0.756, mean=0.8, max=0.822, sum=4.0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 (text-davinci-002)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.715984015984016,
          "markdown": false
        },
        {
          "value": 1.2258358747186795,
          "description": "min=1.226, mean=1.226, max=1.226, sum=1.226 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.8863302536010742,
          "description": "min=0.886, mean=0.886, max=0.886, sum=0.886 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.6834516413211823,
          "description": "min=0.683, mean=0.683, max=0.683, sum=0.683 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.1743956871032715,
          "description": "min=0.174, mean=0.174, max=0.174, sum=0.174 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.17730724048614502,
          "description": "min=0.175, mean=0.177, max=0.181, sum=0.887 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 5.188020693120383,
          "description": "min=3.257, mean=5.188, max=9.459, sum=36.316 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 3.762208682537079,
          "description": "min=3.762, mean=3.762, max=3.762, sum=3.762 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.2229105462585103,
          "description": "min=0.167, mean=0.223, max=0.403, sum=1.115 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.20554606720183052,
          "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.4672719452194591,
          "description": "min=0.446, mean=0.467, max=0.478, sum=2.336 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 Turbo (0613)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.9833333333333334,
          "markdown": false
        },
        {
          "value": 0.3810261323418416,
          "description": "min=0.381, mean=0.381, max=0.381, sum=0.381 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.30532183837890625,
          "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.22069251775741577,
          "description": "min=0.221, mean=0.221, max=0.221, sum=0.221 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.17227248001098633,
          "description": "min=0.172, mean=0.172, max=0.172, sum=0.172 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.1750619323630082,
          "description": "min=0.171, mean=0.175, max=0.177, sum=0.875 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.8128212395123947,
          "description": "min=0.741, mean=0.813, max=0.963, sum=5.69 (7)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.8983073465824127,
          "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.20213919553681423,
          "description": "min=0.178, mean=0.202, max=0.277, sum=1.011 (5)",
          "style": {
            "font-weight": "bold"
          },
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.19374941736755977,
          "description": "min=0.194, mean=0.194, max=0.194, sum=0.194 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.39351808213963385,
          "description": "min=0.367, mean=0.394, max=0.409, sum=1.968 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 Turbo (1106 preview)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.41694971694971694,
          "markdown": false
        },
        {
          "value": 1.068114177945634,
          "description": "min=1.068, mean=1.068, max=1.068, sum=1.068 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.8667134034633637,
          "description": "min=0.867, mean=0.867, max=0.867, sum=0.867 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1.1312835423946381,
          "description": "min=1.131, mean=1.131, max=1.131, sum=1.131 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.5122070140838623,
          "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.4471675806380155,
          "description": "min=0.397, mean=0.447, max=0.515, sum=2.236 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 12.704059314714486,
          "description": "min=10.989, mean=12.704, max=15.09, sum=88.928 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.738402992963791,
          "description": "min=5.738, mean=5.738, max=5.738, sum=5.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.6033123332286346,
          "description": "min=0.445, mean=0.603, max=0.98, sum=3.017 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.3924491192190121,
          "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 2.1004491326059744,
          "description": "min=1.797, mean=2.1, max=2.349, sum=10.502 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 (0613)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5414418914418915,
          "markdown": false
        },
        {
          "value": 0.9758186582108619,
          "description": "min=0.976, mean=0.976, max=0.976, sum=0.976 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.9083020164966583,
          "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.5116857671737671,
          "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.40061268854141235,
          "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.39080846048656265,
          "description": "min=0.364, mean=0.391, max=0.434, sum=1.954 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 3.4718795228507955,
          "description": "min=2.95, mean=3.472, max=4.247, sum=24.303 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 4.947624314308166,
          "description": "min=4.948, mean=4.948, max=4.948, sum=4.948 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.5582764348578453,
          "description": "min=0.46, mean=0.558, max=0.886, sum=2.791 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.4136932588239787,
          "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1.5797039644192494,
          "description": "min=1.448, mean=1.58, max=1.724, sum=7.899 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 Turbo (2024-04-09)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.4861138861138861,
          "markdown": false
        },
        {
          "value": 0.8043310716118611,
          "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.7120162718296051,
          "description": "min=0.712, mean=0.712, max=0.712, sum=0.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.6052222681045533,
          "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.4376141686439514,
          "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.5498773384847139,
          "description": "min=0.53, mean=0.55, max=0.572, sum=2.749 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 6.678270916932833,
          "description": "min=4.92, mean=6.678, max=8.338, sum=46.748 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 6.91472976398468,
          "description": "min=6.915, mean=6.915, max=6.915, sum=6.915 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 0.6081070231398068,
          "description": "min=0.514, mean=0.608, max=0.803, sum=3.041 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.4549296101329341,
          "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1.1850423664020953,
          "description": "min=1.131, mean=1.185, max=1.222, sum=5.925 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.656909756909757,
          "markdown": false
        },
        {
          "value": 0.5561933571184186,
          "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.5071200861930847,
          "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.46105142664909365,
          "description": "min=0.461, mean=0.461, max=0.461, sum=0.461 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4019911346435547,
          "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.3904274333485386,
          "description": "min=0.353, mean=0.39, max=0.416, sum=1.952 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.357550465458739,
          "description": "min=3.334, mean=4.358, max=4.85, sum=30.503 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.227096201658249,
          "description": "min=4.227, mean=4.227, max=4.227, sum=4.227 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 0.4307274274560104,
          "description": "min=0.36, mean=0.431, max=0.568, sum=2.154 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.4072816490416024,
          "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.8424805298775759,
          "description": "min=0.775, mean=0.842, max=0.967, sum=4.212 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.805977355977356,
          "markdown": false
        },
        {
          "value": 0.47311924612018424,
          "description": "min=0.473, mean=0.473, max=0.473, sum=0.473 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.40617332768440245,
          "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.3740478873252869,
          "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.3309546322822571,
          "description": "min=0.331, mean=0.331, max=0.331, sum=0.331 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.2993013315033494,
          "description": "min=0.292, mean=0.299, max=0.309, sum=1.497 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3.175392215033706,
          "description": "min=2.312, mean=3.175, max=3.696, sum=22.228 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 2.5191967821121217,
          "description": "min=2.519, mean=2.519, max=2.519, sum=2.519 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 0.38199841220513264,
          "description": "min=0.337, mean=0.382, max=0.503, sum=1.91 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.3318999989132284,
          "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.5833699647787834,
          "description": "min=0.557, mean=0.583, max=0.598, sum=2.917 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Palmyra X V2 (33B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.5316017316017316,
          "markdown": false
        },
        {
          "value": 1.2016644296511798,
          "description": "min=1.202, mean=1.202, max=1.202, sum=1.202 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.9688332653045655,
          "description": "min=0.969, mean=0.969, max=0.969, sum=0.969 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.6202523970603943,
          "description": "min=0.62, mean=0.62, max=0.62, sum=0.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.4200127201080322,
          "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.5321985618859008,
          "description": "min=0.462, mean=0.532, max=0.577, sum=2.661 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 2.0883775065675723,
          "description": "min=1.722, mean=2.088, max=2.676, sum=14.619 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 2.543274956703186,
          "description": "min=2.543, mean=2.543, max=2.543, sum=2.543 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.7313747247589137,
          "description": "min=0.425, mean=0.731, max=1.784, sum=3.657 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.6051040529967776,
          "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.904815991352295,
          "description": "min=0.83, mean=0.905, max=0.948, sum=4.524 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        }
      ],
      [
        {
          "value": "Palmyra X V3 (72B)",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.27217782217782216,
          "markdown": false
        },
        {
          "value": 2.848917615245765,
          "description": "min=2.849, mean=2.849, max=2.849, sum=2.849 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 2.31904000210762,
          "description": "min=2.319, mean=2.319, max=2.319, sum=2.319 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 2.3729000978469847,
          "description": "min=2.373, mean=2.373, max=2.373, sum=2.373 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.6074039902687073,
          "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.656667516515966,
          "description": "min=0.604, mean=0.657, max=0.783, sum=3.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 4.258683228698372,
          "description": "min=3.23, mean=4.259, max=6.331, sum=29.811 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.069576686620712,
          "description": "min=5.07, mean=5.07, max=5.07, sum=5.07 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.1595191393847304,
          "description": "min=0.668, mean=1.16, max=3.0, sum=5.798 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.9268994279220611,
          "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.4063038200537652,
          "description": "min=1.32, mean=1.406, max=1.477, sum=7.032 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        }
      ],
      [
        {
          "value": "Palmyra-X-004",
          "description": "",
          "markdown": false
        },
        {
          "value": 0.41934731934731934,
          "markdown": false
        },
        {
          "value": 1.634409177135414,
          "description": "min=1.634, mean=1.634, max=1.634, sum=1.634 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1.22119681596756,
          "description": "min=1.221, mean=1.221, max=1.221, sum=1.221 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1.2129934797286988,
          "description": "min=1.213, mean=1.213, max=1.213, sum=1.213 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.2705215420722961,
          "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.39635124337045774,
          "description": "min=0.309, mean=0.396, max=0.722, sum=1.982 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 14.82662017363065,
          "description": "min=5.13, mean=14.827, max=45.729, sum=103.786 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 11.449529441833496,
          "description": "min=11.45, mean=11.45, max=11.45, sum=11.45 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.5037181089898329,
          "description": "min=0.478, mean=0.504, max=0.522, sum=2.519 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.39942375139498093,
          "description": "min=0.399, mean=0.399, max=0.399, sum=0.399 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 2.045695114985284,
          "description": "min=1.801, mean=2.046, max=2.515, sum=10.228 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v1.9.0/groups/latex/core_scenarios_efficiency.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v1.9.0/groups/json/core_scenarios_efficiency.json"
      }
    ],
    "name": "efficiency"
  },
  {
    "title": "General information",
    "header": [
      {
        "value": "Model/adapter",
        "markdown": false,
        "metadata": {}
      },
      {
        "value": "Mean win rate",
        "description": "How many models this model outperforms on average (over columns).",
        "markdown": false,
        "lower_is_better": false,
        "metadata": {}
      },
      {
        "value": "NarrativeQA - # eval",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NarrativeQA - # train",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NarrativeQA - truncated",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NarrativeQA - # prompt tokens",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NarrativeQA - # output tokens",
        "description": "The NarrativeQA benchmark for reading comprehension over narratives [(Ko\u010disk\u00fd et al., 2017)](https://aclanthology.org/Q18-1023/).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "NarrativeQA"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - # eval",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - # train",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - truncated",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - # prompt tokens",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (open-book) - # output tokens",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input includes the Wikipedia page with the answer.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "NaturalQuestions (open-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - # eval",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - # train",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - truncated",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - # prompt tokens",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "NaturalQuestions (closed-book) - # output tokens",
        "description": "The NaturalQuestions [(Kwiatkowski et al., 2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering based on naturally-occurring queries through Google Search. The input does not include the Wikipedia page with the answer.\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "NaturalQuestions (closed-book)"
        }
      },
      {
        "value": "OpenbookQA - # eval",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "OpenbookQA - # train",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "OpenbookQA - truncated",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "OpenbookQA - # prompt tokens",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "OpenbookQA - # output tokens",
        "description": "The OpenbookQA benchmark for commonsense-intensive open book question answering [(Mihaylov et al., 2018)](https://aclanthology.org/D18-1260/).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "OpenbookQA"
        }
      },
      {
        "value": "MMLU - # eval",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MMLU - # train",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MMLU - truncated",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MMLU - # prompt tokens",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MMLU - # output tokens",
        "description": "The Massive Multitask Language Understanding (MMLU) benchmark for knowledge-intensive question answering across 57 domains [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2009.03300.pdf).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MMLU"
        }
      },
      {
        "value": "MATH - # eval",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MATH"
        }
      },
      {
        "value": "MATH - # train",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MATH"
        }
      },
      {
        "value": "MATH - truncated",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MATH"
        }
      },
      {
        "value": "MATH - # prompt tokens",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MATH"
        }
      },
      {
        "value": "MATH - # output tokens",
        "description": "The MATH benchmark for measuring mathematical problem solving on competition math problems with chain-of-thought style reasoning [(Hendrycks et al., 2021)](https://arxiv.org/pdf/2103.03874.pdf).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MATH"
        }
      },
      {
        "value": "GSM8K - # eval",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "GSM8K - # train",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "GSM8K - truncated",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "GSM8K - # prompt tokens",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "GSM8K - # output tokens",
        "description": "The grade school math word problems dataset (GSM8K) for testing mathematical reasoning on grade-school math problems [(Cobbe et al., 2021)](https://arxiv.org/pdf/2110.14168.pdf).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "GSM8K"
        }
      },
      {
        "value": "LegalBench - # eval",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "LegalBench - # train",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "LegalBench - truncated",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "LegalBench - # prompt tokens",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "LegalBench - # output tokens",
        "description": "LegalBench is a large collaboratively constructed benchmark of legal reasoning tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "LegalBench"
        }
      },
      {
        "value": "MedQA - # eval",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "MedQA"
        }
      },
      {
        "value": "MedQA - # train",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "MedQA"
        }
      },
      {
        "value": "MedQA - truncated",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "MedQA"
        }
      },
      {
        "value": "MedQA - # prompt tokens",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "MedQA"
        }
      },
      {
        "value": "MedQA - # output tokens",
        "description": "MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "MedQA"
        }
      },
      {
        "value": "WMT 2014 - # eval",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\n# eval: Number of evaluation instances.",
        "markdown": false,
        "metadata": {
          "metric": "# eval",
          "run_group": "WMT 2014"
        }
      },
      {
        "value": "WMT 2014 - # train",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\n# train: Number of training instances (e.g., in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "# train",
          "run_group": "WMT 2014"
        }
      },
      {
        "value": "WMT 2014 - truncated",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\ntruncated: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).",
        "markdown": false,
        "metadata": {
          "metric": "truncated",
          "run_group": "WMT 2014"
        }
      },
      {
        "value": "WMT 2014 - # prompt tokens",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\n# prompt tokens: Number of tokens in the prompt.",
        "markdown": false,
        "metadata": {
          "metric": "# prompt tokens",
          "run_group": "WMT 2014"
        }
      },
      {
        "value": "WMT 2014 - # output tokens",
        "description": "WMT 2014 is a collection of machine translation datasets [(website)](https://www.statmt.org/wmt14/index.html).\n\n# output tokens: Actual number of output tokens.",
        "markdown": false,
        "metadata": {
          "metric": "# output tokens",
          "run_group": "WMT 2014"
        }
      }
    ],
    "rows": [
      [
        {
          "value": "Phi-3 (14B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 4.391549295774648,
          "description": "min=4.392, mean=4.392, max=4.392, sum=4.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 3685.707042253521,
          "description": "min=3685.707, mean=3685.707, max=3685.707, sum=3685.707 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 7.245070422535211,
          "description": "min=7.245, mean=7.245, max=7.245, sum=7.245 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 4.83,
          "description": "min=4.83, mean=4.83, max=4.83, sum=4.83 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 2307.695,
          "description": "min=2307.695, mean=2307.695, max=2307.695, sum=2307.695 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 8.371,
          "description": "min=8.371, mean=8.371, max=8.371, sum=8.371 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 156.383,
          "description": "min=156.383, mean=156.383, max=156.383, sum=156.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 10.079,
          "description": "min=10.079, mean=10.079, max=10.079, sum=10.079 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 291.574,
          "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 531.5470877192982,
          "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1438.6362030100095,
          "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 372.1276343562145,
          "description": "min=357.548, mean=372.128, max=392.767, sum=2604.893 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1207.746,
          "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 400.0,
          "description": "min=400, mean=400, max=400, sum=400 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 4.176734693877551,
          "description": "min=1.884, mean=4.177, max=5, sum=20.884 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0008163265306122449,
          "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1033.8183708736613,
          "description": "min=229.137, mean=1033.818, max=3646.718, sum=5169.092 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1.3564703389458466,
          "description": "min=1, mean=1.356, max=1.979, sum=6.782 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-medium-4k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1243.9005964214712,
          "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 150.28751290334915,
          "description": "min=135.523, mean=150.288, max=172.972, sum=751.438 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        },
        {
          "value": 99.65089463220676,
          "description": "min=98.254, mean=99.651, max=100, sum=498.254 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-medium-4k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-medium-4k-instruct"
          ]
        }
      ],
      [
        {
          "value": "Phi-3 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 3485.6704225352114,
          "description": "min=3485.67, mean=3485.67, max=3485.67, sum=3485.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 33.709859154929575,
          "description": "min=33.71, mean=33.71, max=33.71, sum=33.71 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 4.965,
          "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1675.981,
          "description": "min=1675.981, mean=1675.981, max=1675.981, sum=1675.981 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 16.786,
          "description": "min=16.786, mean=16.786, max=16.786, sum=16.786 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 129.127,
          "description": "min=129.127, mean=129.127, max=129.127, sum=129.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 36.311,
          "description": "min=36.311, mean=36.311, max=36.311, sum=36.311 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 249.782,
          "description": "min=249.782, mean=249.782, max=249.782, sum=249.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 467.71996491228066,
          "description": "min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1262.9108741840687,
          "description": "min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 115.23627800867702,
          "description": "min=57.779, mean=115.236, max=283.904, sum=806.654 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "description": "No matching runs",
          "markdown": false
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1512.6870529886412,
          "description": "min=197.442, mean=1512.687, max=6294.008, sum=7563.435 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1.192017037143267,
          "description": "min=1, mean=1.192, max=1.538, sum=5.96 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-3-small-8k-instruct",
            "legalbench:subset=proa,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1027.4135188866799,
          "description": "min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 138.04258583116683,
          "description": "min=114.901, mean=138.043, max=158.185, sum=690.213 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        },
        {
          "value": 96.96643456568283,
          "description": "min=96.311, mean=96.966, max=98.575, sum=484.832 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=de-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-3-small-8k-instruct",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-3-small-8k-instruct"
          ]
        }
      ],
      [
        {
          "value": "DBRX Instruct",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 3522.6704225352114,
          "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1762.593,
          "description": "min=1762.593, mean=1762.593, max=1762.593, sum=1762.593 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 173.127,
          "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 242.782,
          "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 460.71996491228066,
          "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=databricks_dbrx-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1323.910874184069,
          "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1020.035,
          "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1570.162971355988,
          "description": "min=253.442, mean=1570.163, max=6357.388, sum=7850.815 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=databricks_dbrx-instruct",
            "legalbench:subset=corporate_lobbying,model=databricks_dbrx-instruct",
            "legalbench:subset=function_of_decision_section,model=databricks_dbrx-instruct",
            "legalbench:subset=international_citizenship_questions,model=databricks_dbrx-instruct",
            "legalbench:subset=proa,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1020.4135188866799,
          "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 193.04258583116683,
          "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=de-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=fr-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=hi-en,model=databricks_dbrx-instruct",
            "wmt_14:language_pair=ru-en,model=databricks_dbrx-instruct"
          ]
        }
      ],
      [
        {
          "value": "DeepSeek LLM Chat (67B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 4.946478873239436,
          "description": "min=4.946, mean=4.946, max=4.946, sum=4.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 3583.1464788732396,
          "description": "min=3583.146, mean=3583.146, max=3583.146, sum=3583.146 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 4.841,
          "description": "min=4.841, mean=4.841, max=4.841, sum=4.841 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.024,
          "description": "min=0.024, mean=0.024, max=0.024, sum=0.024 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 2192.734,
          "description": "min=2192.734, mean=2192.734, max=2192.734, sum=2192.734 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 199.39,
          "description": "min=199.39, mean=199.39, max=199.39, sum=199.39 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 253.206,
          "description": "min=253.206, mean=253.206, max=253.206, sum=253.206 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 490.9413333333334,
          "description": "min=382.07, mean=490.941, max=646.667, sum=2454.707 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1443.2895059403625,
          "description": "min=1012.548, mean=1443.29, max=2448.25, sum=10103.027 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1233.708,
          "description": "min=1233.708, mean=1233.708, max=1233.708, sum=1233.708 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 4.201224489795918,
          "description": "min=2.006, mean=4.201, max=5, sum=21.006 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 990.259348667894,
          "description": "min=269.379, mean=990.259, max=3325.551, sum=4951.297 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=corporate_lobbying,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=function_of_decision_section,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=international_citizenship_questions,model=deepseek-ai_deepseek-llm-67b-chat",
            "legalbench:subset=proa,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1084.234592445328,
          "description": "min=1084.235, mean=1084.235, max=1084.235, sum=1084.235 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 220.29060445022174,
          "description": "min=203.736, mean=220.291, max=255.861, sum=1101.453 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=de-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=fr-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=hi-en,model=deepseek-ai_deepseek-llm-67b-chat",
            "wmt_14:language_pair=ru-en,model=deepseek-ai_deepseek-llm-67b-chat"
          ]
        }
      ],
      [
        {
          "value": "Falcon (40B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 2.0225352112676056,
          "description": "min=2.023, mean=2.023, max=2.023, sum=2.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1692.3295774647888,
          "description": "min=1692.33, mean=1692.33, max=1692.33, sum=1692.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 4.598,
          "description": "min=4.598, mean=4.598, max=4.598, sum=4.598 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.039,
          "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1586.717,
          "description": "min=1586.717, mean=1586.717, max=1586.717, sum=1586.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.991,
          "description": "min=0.991, mean=0.991, max=0.991, sum=0.991 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 124.246,
          "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 251.174,
          "description": "min=251.174, mean=251.174, max=251.174, sum=251.174 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 500.12014035087725,
          "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-40b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 6.818102949681896,
          "description": "min=2.385, mean=6.818, max=8, sum=47.727 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1150.0493709178531,
          "description": "min=965.096, mean=1150.049, max=1495.447, sum=8050.346 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1056.967,
          "description": "min=1056.967, mean=1056.967, max=1056.967, sum=1056.967 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 3.853061224489796,
          "description": "min=0.265, mean=3.853, max=5, sum=19.265 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0032653061224489797,
          "description": "min=0, mean=0.003, max=0.016, sum=0.016 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 566.6935553560819,
          "description": "min=211.284, mean=566.694, max=1486.482, sum=2833.468 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.9751020408163266,
          "description": "min=0.876, mean=0.975, max=1, sum=4.876 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-40b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-40b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-40b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-40b",
            "legalbench:subset=proa,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1048.624254473161,
          "description": "min=1048.624, mean=1048.624, max=1048.624, sum=1048.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 162.45444400902278,
          "description": "min=115.642, mean=162.454, max=224.817, sum=812.272 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-40b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-40b"
          ]
        }
      ],
      [
        {
          "value": "Falcon (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 2.0225352112676056,
          "description": "min=2.023, mean=2.023, max=2.023, sum=2.023 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1692.3295774647888,
          "description": "min=1692.33, mean=1692.33, max=1692.33, sum=1692.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 4.598,
          "description": "min=4.598, mean=4.598, max=4.598, sum=4.598 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.039,
          "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1586.717,
          "description": "min=1586.717, mean=1586.717, max=1586.717, sum=1586.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.99,
          "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 124.246,
          "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 251.174,
          "description": "min=251.174, mean=251.174, max=251.174, sum=251.174 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 500.12014035087725,
          "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=tiiuae_falcon-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 6.818102949681896,
          "description": "min=2.385, mean=6.818, max=8, sum=47.727 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1150.0493709178531,
          "description": "min=965.096, mean=1150.049, max=1495.447, sum=8050.346 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1056.967,
          "description": "min=1056.967, mean=1056.967, max=1056.967, sum=1056.967 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 3.853061224489796,
          "description": "min=0.265, mean=3.853, max=5, sum=19.265 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0032653061224489797,
          "description": "min=0, mean=0.003, max=0.016, sum=0.016 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 566.6935553560819,
          "description": "min=211.284, mean=566.694, max=1486.482, sum=2833.468 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.9963265306122449,
          "description": "min=0.982, mean=0.996, max=1, sum=4.982 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=tiiuae_falcon-7b",
            "legalbench:subset=corporate_lobbying,model=tiiuae_falcon-7b",
            "legalbench:subset=function_of_decision_section,model=tiiuae_falcon-7b",
            "legalbench:subset=international_citizenship_questions,model=tiiuae_falcon-7b",
            "legalbench:subset=proa,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1048.624254473161,
          "description": "min=1048.624, mean=1048.624, max=1048.624, sum=1048.624 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 162.45444400902278,
          "description": "min=115.642, mean=162.454, max=224.817, sum=812.272 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        },
        {
          "value": 0.9997596153846153,
          "description": "min=0.999, mean=1.0, max=1, sum=4.999 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=de-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=fr-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=hi-en,model=tiiuae_falcon-7b",
            "wmt_14:language_pair=ru-en,model=tiiuae_falcon-7b"
          ]
        }
      ],
      [
        {
          "value": "Gemma 2 Instruct (27B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 3437.994366197183,
          "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 4.953,
          "description": "min=4.953, mean=4.953, max=4.953, sum=4.953 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.009,
          "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1911.526,
          "description": "min=1911.526, mean=1911.526, max=1911.526, sum=1911.526 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 143.995,
          "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.993,
          "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 248.508,
          "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 481.5305263157895,
          "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-27b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1355.5064552904823,
          "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1151.885,
          "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1546.699013263404,
          "description": "min=199.916, mean=1546.699, max=6405.871, sum=7733.495 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-27b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-27b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-27b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-27b-it",
            "legalbench:subset=proa,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1029.4811133200794,
          "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 110.97025108961614,
          "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-27b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-27b-it"
          ]
        }
      ],
      [
        {
          "value": "Gemma 2 Instruct (9B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 3437.994366197183,
          "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 4.953,
          "description": "min=4.953, mean=4.953, max=4.953, sum=4.953 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.009,
          "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1911.526,
          "description": "min=1911.526, mean=1911.526, max=1911.526, sum=1911.526 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 143.995,
          "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 248.508,
          "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 481.5305263157895,
          "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-2-9b-it",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1355.5064552904823,
          "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 1151.885,
          "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-2-9b-it,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1546.699013263404,
          "description": "min=199.916, mean=1546.699, max=6405.871, sum=7733.495 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-2-9b-it",
            "legalbench:subset=corporate_lobbying,model=google_gemma-2-9b-it",
            "legalbench:subset=function_of_decision_section,model=google_gemma-2-9b-it",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-2-9b-it",
            "legalbench:subset=proa,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1029.4811133200794,
          "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 110.97025108961614,
          "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=de-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=fr-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=hi-en,model=google_gemma-2-9b-it",
            "wmt_14:language_pair=ru-en,model=google_gemma-2-9b-it"
          ]
        }
      ],
      [
        {
          "value": "Gemma (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 3411.994366197183,
          "description": "min=3411.994, mean=3411.994, max=3411.994, sum=3411.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 4.94,
          "description": "min=4.94, mean=4.94, max=4.94, sum=4.94 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.01,
          "description": "min=0.01, mean=0.01, max=0.01, sum=0.01 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 1879.978,
          "description": "min=1879.978, mean=1879.978, max=1879.978, sum=1879.978 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemma-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 125.995,
          "description": "min=125.995, mean=125.995, max=125.995, sum=125.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemma-7b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 240.508,
          "description": "min=240.508, mean=240.508, max=240.508, sum=240.508 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 473.5305263157895,
          "description": "min=372.91, mean=473.531, max=626.553, sum=2367.653 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 1355.5064552904823,
          "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemma-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 1151.885,
          "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemma-7b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 4.794693877551021,
          "description": "min=4, mean=4.795, max=5, sum=23.973 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 1536.5573806103425,
          "description": "min=193.916, mean=1536.557, max=6379.163, sum=7682.787 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemma-7b",
            "legalbench:subset=corporate_lobbying,model=google_gemma-7b",
            "legalbench:subset=function_of_decision_section,model=google_gemma-7b",
            "legalbench:subset=international_citizenship_questions,model=google_gemma-7b",
            "legalbench:subset=proa,model=google_gemma-7b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 1021.4811133200795,
          "description": "min=1021.481, mean=1021.481, max=1021.481, sum=1021.481 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemma-7b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        },
        {
          "value": 103.97025108961614,
          "description": "min=73.732, mean=103.97, max=130.366, sum=519.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemma-7b",
            "wmt_14:language_pair=de-en,model=google_gemma-7b",
            "wmt_14:language_pair=fr-en,model=google_gemma-7b",
            "wmt_14:language_pair=hi-en,model=google_gemma-7b",
            "wmt_14:language_pair=ru-en,model=google_gemma-7b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (13B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 4.408450704225352,
          "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 3669.8084507042254,
          "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 4.831,
          "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 2289.357,
          "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.986,
          "description": "min=0.986, mean=0.986, max=0.986, sum=0.986 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 137.383,
          "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 282.574,
          "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 522.5470877192982,
          "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-13b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1438.6362030100095,
          "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1207.746,
          "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 4.177142857142857,
          "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0008163265306122449,
          "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1027.3502076083553,
          "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-13b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-13b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-13b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-13b",
            "legalbench:subset=proa,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1234.9005964214712,
          "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-13b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 142.28751290334915,
          "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-13b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-13b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (70B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 4.408450704225352,
          "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 3669.8084507042254,
          "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 4.831,
          "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 2289.357,
          "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.996,
          "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 137.383,
          "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 282.574,
          "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 522.5470877192982,
          "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1438.6362030100095,
          "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1207.746,
          "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 4.177142857142857,
          "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0008163265306122449,
          "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1027.3502076083553,
          "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-70b",
            "legalbench:subset=proa,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1234.9005964214712,
          "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-70b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 142.28751290334915,
          "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-70b"
          ]
        }
      ],
      [
        {
          "value": "Llama 2 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 4.408450704225352,
          "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 3669.8084507042254,
          "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 4.831,
          "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 2289.357,
          "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.958,
          "description": "min=0.958, mean=0.958, max=0.958, sum=0.958 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 137.383,
          "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.996,
          "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 282.574,
          "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 522.5470877192982,
          "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1438.6362030100095,
          "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1207.746,
          "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 4.177142857142857,
          "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0008163265306122449,
          "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1027.3502076083553,
          "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-2-7b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-2-7b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-2-7b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-2-7b",
            "legalbench:subset=proa,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1234.9005964214712,
          "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-2-7b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 142.28751290334915,
          "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=de-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=fr-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=hi-en,model=meta_llama-2-7b",
            "wmt_14:language_pair=ru-en,model=meta_llama-2-7b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3 (70B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 3460.2676056338028,
          "description": "min=3460.268, mean=3460.268, max=3460.268, sum=3460.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 4.965,
          "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1658.348,
          "description": "min=1658.348, mean=1658.348, max=1658.348, sum=1658.348 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 112.12,
          "description": "min=112.12, mean=112.12, max=112.12, sum=112.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 242.776,
          "description": "min=242.776, mean=242.776, max=242.776, sum=242.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 460.6862105263158,
          "description": "min=366.43, mean=460.686, max=607.421, sum=2303.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-70b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1507.4065013565441,
          "description": "min=192.442, mean=1507.407, max=6287.633, sum=7537.033 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-70b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-70b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-70b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-70b",
            "legalbench:subset=proa,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1018.2743538767396,
          "description": "min=1018.274, mean=1018.274, max=1018.274, sum=1018.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-70b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 109.86804366111025,
          "description": "min=90.139, mean=109.868, max=130.33, sum=549.34 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-70b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-70b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3 (8B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 3460.2676056338028,
          "description": "min=3460.268, mean=3460.268, max=3460.268, sum=3460.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 4.965,
          "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1658.348,
          "description": "min=1658.348, mean=1658.348, max=1658.348, sum=1658.348 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.999,
          "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 112.12,
          "description": "min=112.12, mean=112.12, max=112.12, sum=112.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 242.776,
          "description": "min=242.776, mean=242.776, max=242.776, sum=242.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 460.6862105263158,
          "description": "min=366.43, mean=460.686, max=607.421, sum=2303.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3-8b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1507.4065013565441,
          "description": "min=192.442, mean=1507.407, max=6287.633, sum=7537.033 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3-8b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3-8b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3-8b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3-8b",
            "legalbench:subset=proa,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1018.2743538767396,
          "description": "min=1018.274, mean=1018.274, max=1018.274, sum=1018.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3-8b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 109.86804366111025,
          "description": "min=90.139, mean=109.868, max=130.33, sum=549.34 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=de-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=fr-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=hi-en,model=meta_llama-3-8b",
            "wmt_14:language_pair=ru-en,model=meta_llama-3-8b"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (405B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 3484.2676056338028,
          "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 9.904225352112675,
          "description": "min=9.904, mean=9.904, max=9.904, sum=9.904 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1716.78,
          "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 8.741,
          "description": "min=8.741, mean=8.741, max=8.741, sum=8.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 129.12,
          "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 8.576,
          "description": "min=8.576, mean=8.576, max=8.576, sum=8.576 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 249.776,
          "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 467.6862105263158,
          "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 232.69774473452566,
          "description": "min=175.942, mean=232.698, max=270.904, sum=1628.884 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 122.777,
          "description": "min=122.777, mean=122.777, max=122.777, sum=122.777 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-405b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1513.8824197238912,
          "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 2.4069553133514985,
          "description": "min=2, mean=2.407, max=3, sum=12.035 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-405b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1025.2743538767395,
          "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 120.71178123566294,
          "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        },
        {
          "value": 26.055818454656674,
          "description": "min=24.598, mean=26.056, max=26.819, sum=130.279 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-405b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-405b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (70B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 3484.2676056338028,
          "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 9.033802816901408,
          "description": "min=9.034, mean=9.034, max=9.034, sum=9.034 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1716.78,
          "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 8.203,
          "description": "min=8.203, mean=8.203, max=8.203, sum=8.203 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 129.12,
          "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 7.222,
          "description": "min=7.222, mean=7.222, max=7.222, sum=7.222 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 249.776,
          "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 467.6862105263158,
          "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 243.36764411525732,
          "description": "min=184.733, mean=243.368, max=279.105, sum=1703.574 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 127.086,
          "description": "min=127.086, mean=127.086, max=127.086, sum=127.086 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-70b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1513.8824197238912,
          "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 2.5376711028251826,
          "description": "min=2, mean=2.538, max=4.032, sum=12.688 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-70b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1025.2743538767395,
          "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 120.71178123566294,
          "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        },
        {
          "value": 25.78567441504817,
          "description": "min=24.231, mean=25.786, max=26.692, sum=128.928 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-70b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-70b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.1 Instruct Turbo (8B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 3484.2676056338028,
          "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 7.2873239436619714,
          "description": "min=7.287, mean=7.287, max=7.287, sum=7.287 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1716.78,
          "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 8.736,
          "description": "min=8.736, mean=8.736, max=8.736, sum=8.736 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 129.12,
          "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 11.732,
          "description": "min=11.732, mean=11.732, max=11.732, sum=11.732 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 249.776,
          "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 467.6862105263158,
          "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 253.98170179473732,
          "description": "min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 150.02,
          "description": "min=150.02, mean=150.02, max=150.02, sum=150.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.1-8b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1513.8824197238912,
          "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 6.823557876005701,
          "description": "min=2.032, mean=6.824, max=10.886, sum=34.118 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.1-8b-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1025.2743538767395,
          "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 120.71178123566294,
          "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        },
        {
          "value": 25.778561802263347,
          "description": "min=24.354, mean=25.779, max=26.833, sum=128.893 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.1-8b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.1-8b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.2 Vision Instruct Turbo (11B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 3484.2676056338028,
          "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1716.785,
          "description": "min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 129.12,
          "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 249.776,
          "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 467.6862105263158,
          "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-11b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1513.8824197238912,
          "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1025.2743538767395,
          "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 120.86804366111025,
          "description": "min=101.139, mean=120.868, max=141.33, sum=604.34 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-11b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-11b-vision-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Llama 3.2 Vision Instruct Turbo (90B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 3484.2676056338028,
          "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1716.785,
          "description": "min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 129.12,
          "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 249.776,
          "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 467.6862105263158,
          "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-3.2-90b-vision-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1513.8824197238912,
          "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=corporate_lobbying,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=function_of_decision_section,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "legalbench:subset=proa,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1025.2743538767395,
          "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 120.86804366111025,
          "description": "min=101.139, mean=120.868, max=141.33, sum=604.34 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=de-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=meta_llama-3.2-90b-vision-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=meta_llama-3.2-90b-vision-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "LLaMA (65B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1.4338028169014085,
          "description": "min=1.434, mean=1.434, max=1.434, sum=1.434 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1539.5859154929578,
          "description": "min=1539.586, mean=1539.586, max=1539.586, sum=1539.586 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 3.722,
          "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.049,
          "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 1407.129,
          "description": "min=1407.129, mean=1407.129, max=1407.129, sum=1407.129 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.985,
          "description": "min=0.985, mean=0.985, max=0.985, sum=0.985 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=meta_llama-65b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 137.383,
          "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=meta_llama-65b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 282.574,
          "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 522.5470877192982,
          "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-65b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-65b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 6.896761133603239,
          "description": "min=2.962, mean=6.897, max=8, sum=48.277 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 1214.7073423969382,
          "description": "min=971.652, mean=1214.707, max=1552.038, sum=8502.951 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=meta_llama-65b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 1207.746,
          "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=meta_llama-65b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 3.8048979591836734,
          "description": "min=0.024, mean=3.805, max=5, sum=19.024 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.006122448979591836,
          "description": "min=0, mean=0.006, max=0.031, sum=0.031 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 595.1612280165185,
          "description": "min=222.137, mean=595.161, max=1481.433, sum=2975.806 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.9763265306122448,
          "description": "min=0.882, mean=0.976, max=1, sum=4.882 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=meta_llama-65b",
            "legalbench:subset=corporate_lobbying,model=meta_llama-65b",
            "legalbench:subset=function_of_decision_section,model=meta_llama-65b",
            "legalbench:subset=international_citizenship_questions,model=meta_llama-65b",
            "legalbench:subset=proa,model=meta_llama-65b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1234.9005964214712,
          "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=meta_llama-65b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        },
        {
          "value": 142.28751290334915,
          "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=meta_llama-65b",
            "wmt_14:language_pair=de-en,model=meta_llama-65b",
            "wmt_14:language_pair=fr-en,model=meta_llama-65b",
            "wmt_14:language_pair=hi-en,model=meta_llama-65b",
            "wmt_14:language_pair=ru-en,model=meta_llama-65b"
          ]
        }
      ],
      [
        {
          "value": "Mistral Instruct v0.3 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 3924.3295774647886,
          "description": "min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 7.107042253521127,
          "description": "min=7.107, mean=7.107, max=7.107, sum=7.107 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 2498.79,
          "description": "min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 12.448,
          "description": "min=12.448, mean=12.448, max=12.448, sum=12.448 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 172.069,
          "description": "min=172.069, mean=172.069, max=172.069, sum=172.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 20.461,
          "description": "min=20.461, mean=20.461, max=20.461, sum=20.461 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 289.15,
          "description": "min=289.15, mean=289.15, max=289.15, sum=289.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 532.0910877192983,
          "description": "min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1455.2664139976257,
          "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 149.99043902740354,
          "description": "min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 1187.268,
          "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 196.611,
          "description": "min=196.611, mean=196.611, max=196.611, sum=196.611 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-instruct-v0.3,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1750.7482458432962,
          "description": "min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 9.17419274343898,
          "description": "min=2, mean=9.174, max=15.242, sum=45.871 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-instruct-v0.3",
            "legalbench:subset=proa,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1202.0934393638172,
          "description": "min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 162.43317355482492,
          "description": "min=148.306, mean=162.433, max=181.018, sum=812.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        },
        {
          "value": 30.510483732222053,
          "description": "min=28.3, mean=30.51, max=31.912, sum=152.552 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-instruct-v0.3",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-instruct-v0.3"
          ]
        }
      ],
      [
        {
          "value": "Mistral v0.1 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 4.574647887323944,
          "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 3627.7154929577464,
          "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 4.832,
          "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 2268.728,
          "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.988,
          "description": "min=0.988, mean=0.988, max=0.988, sum=0.988 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 142.069,
          "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 280.15,
          "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 523.0910877192983,
          "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1455.2664139976257,
          "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1187.268,
          "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 4.1938775510204085,
          "description": "min=1.969, mean=4.194, max=5, sum=20.969 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 998.5025315575822,
          "description": "min=219.453, mean=998.503, max=3534.259, sum=4992.513 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.9983673469387755,
          "description": "min=0.992, mean=0.998, max=1, sum=4.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-7b-v0.1",
            "legalbench:subset=proa,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1193.0934393638172,
          "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 144.43317355482492,
          "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-7b-v0.1",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-7b-v0.1"
          ]
        }
      ],
      [
        {
          "value": "Mixtral (8x22B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 3886.3295774647886,
          "description": "min=3886.33, mean=3886.33, max=3886.33, sum=3886.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 2468.79,
          "description": "min=2468.79, mean=2468.79, max=2468.79, sum=2468.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 142.069,
          "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 280.15,
          "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 523.0910877192983,
          "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x22b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1455.2664139976257,
          "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1187.268,
          "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1733.148245843296,
          "description": "min=219.453, mean=1733.148, max=7207.488, sum=8665.741 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x22b",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x22b",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x22b",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x22b",
            "legalbench:subset=proa,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1193.0934393638172,
          "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 144.43317355482492,
          "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x22b",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x22b"
          ]
        }
      ],
      [
        {
          "value": "Mixtral (8x7B 32K seqlen)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 4.574647887323944,
          "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 3627.7154929577464,
          "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 4.832,
          "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 2268.728,
          "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.991,
          "description": "min=0.991, mean=0.991, max=0.991, sum=0.991 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 142.069,
          "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.999,
          "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 280.15,
          "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 523.0910877192983,
          "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1455.2664139976257,
          "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1187.268,
          "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 4.1938775510204085,
          "description": "min=1.969, mean=4.194, max=5, sum=20.969 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 998.5025315575822,
          "description": "min=219.453, mean=998.503, max=3534.259, sum=4992.513 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.9995918367346939,
          "description": "min=0.998, mean=1.0, max=1, sum=4.998 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=corporate_lobbying,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=function_of_decision_section,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mixtral-8x7b-32kseqlen",
            "legalbench:subset=proa,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1193.0934393638172,
          "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 144.43317355482492,
          "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        },
        {
          "value": 0.998798076923077,
          "description": "min=0.994, mean=0.999, max=1, sum=4.994 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=de-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=fr-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=hi-en,model=mistralai_mixtral-8x7b-32kseqlen",
            "wmt_14:language_pair=ru-en,model=mistralai_mixtral-8x7b-32kseqlen"
          ]
        }
      ],
      [
        {
          "value": "OLMo (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.9690140845070423,
          "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1691.081690140845,
          "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 4.703,
          "description": "min=4.703, mean=4.703, max=4.703, sum=4.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.037,
          "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1495.001,
          "description": "min=1495.001, mean=1495.001, max=1495.001, sum=1495.001 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.998,
          "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 117.299,
          "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 251.556,
          "description": "min=251.556, mean=251.556, max=251.556, sum=251.556 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 467.935649122807,
          "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=allenai_olmo-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 6.9758530942741475,
          "description": "min=3.173, mean=6.976, max=8, sum=48.831 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1111.0696790674758,
          "description": "min=860.23, mean=1111.07, max=1508.423, sum=7777.488 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 939.582,
          "description": "min=939.582, mean=939.582, max=939.582, sum=939.582 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 3.859591836734694,
          "description": "min=0.298, mean=3.86, max=5, sum=19.298 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.002857142857142857,
          "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 559.9203981649337,
          "description": "min=206.779, mean=559.92, max=1493.837, sum=2799.602 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=allenai_olmo-7b",
            "legalbench:subset=corporate_lobbying,model=allenai_olmo-7b",
            "legalbench:subset=function_of_decision_section,model=allenai_olmo-7b",
            "legalbench:subset=international_citizenship_questions,model=allenai_olmo-7b",
            "legalbench:subset=proa,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 994.5884691848906,
          "description": "min=994.588, mean=994.588, max=994.588, sum=994.588 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=allenai_olmo-7b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 144.94816676861905,
          "description": "min=129.879, mean=144.948, max=167.177, sum=724.741 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=de-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=fr-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=hi-en,model=allenai_olmo-7b",
            "wmt_14:language_pair=ru-en,model=allenai_olmo-7b"
          ]
        }
      ],
      [
        {
          "value": "Phi-2",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 2.084507042253521,
          "description": "min=2.085, mean=2.085, max=2.085, sum=2.085 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 1705.0056338028169,
          "description": "min=1705.006, mean=1705.006, max=1705.006, sum=1705.006 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 4.706,
          "description": "min=4.706, mean=4.706, max=4.706, sum=4.706 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.036,
          "description": "min=0.036, mean=0.036, max=0.036, sum=0.036 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1493.994,
          "description": "min=1493.994, mean=1493.994, max=1493.994, sum=1493.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 116.254,
          "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=microsoft_phi-2"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 254.216,
          "description": "min=254.216, mean=254.216, max=254.216, sum=254.216 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 472.2740350877192,
          "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 6.915558126084441,
          "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1162.1258475895563,
          "description": "min=906.541, mean=1162.126, max=1511.442, sum=8134.881 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 938.893,
          "description": "min=938.893, mean=938.893, max=938.893, sum=938.893 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=microsoft_phi-2"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 3.8673469387755106,
          "description": "min=0.337, mean=3.867, max=5, sum=19.337 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.002857142857142857,
          "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 566.2485439511586,
          "description": "min=205.632, mean=566.249, max=1519.543, sum=2831.243 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=microsoft_phi-2",
            "legalbench:subset=corporate_lobbying,model=microsoft_phi-2",
            "legalbench:subset=function_of_decision_section,model=microsoft_phi-2",
            "legalbench:subset=international_citizenship_questions,model=microsoft_phi-2",
            "legalbench:subset=proa,model=microsoft_phi-2"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 1038.8330019880716,
          "description": "min=1038.833, mean=1038.833, max=1038.833, sum=1038.833 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=microsoft_phi-2"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        },
        {
          "value": 181.69235022556967,
          "description": "min=136.93, mean=181.692, max=241.656, sum=908.462 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=microsoft_phi-2",
            "wmt_14:language_pair=de-en,model=microsoft_phi-2",
            "wmt_14:language_pair=fr-en,model=microsoft_phi-2",
            "wmt_14:language_pair=hi-en,model=microsoft_phi-2",
            "wmt_14:language_pair=ru-en,model=microsoft_phi-2"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 Chat (110B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 3502.912676056338,
          "description": "min=3502.913, mean=3502.913, max=3502.913, sum=3502.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 10.290140845070422,
          "description": "min=10.29, mean=10.29, max=10.29, sum=10.29 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 2017.955,
          "description": "min=2017.955, mean=2017.955, max=2017.955, sum=2017.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 8.509,
          "description": "min=8.509, mean=8.509, max=8.509, sum=8.509 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 146.262,
          "description": "min=146.262, mean=146.262, max=146.262, sum=146.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 8.99,
          "description": "min=8.99, mean=8.99, max=8.99, sum=8.99 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 249.846,
          "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 477.8357192982456,
          "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 156.85484968134907,
          "description": "min=104.174, mean=156.855, max=202.368, sum=1097.984 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 175.784,
          "description": "min=175.784, mean=175.784, max=175.784, sum=175.784 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-110b-chat,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1557.0883229968654,
          "description": "min=207.453, mean=1557.088, max=6445.714, sum=7785.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 2.3142312634447153,
          "description": "min=2, mean=2.314, max=2.958, sum=11.571 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-110b-chat",
            "legalbench:subset=proa,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1052.4850894632207,
          "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 142.65662658663405,
          "description": "min=124.855, mean=142.657, max=158.373, sum=713.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        },
        {
          "value": 26.94872734745374,
          "description": "min=25.499, mean=26.949, max=27.529, sum=134.744 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-110b-chat",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-110b-chat"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (14B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 3468.912676056338,
          "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1990.955,
          "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 119.262,
          "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 242.846,
          "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 470.8357192982456,
          "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-14b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1542.0883229968654,
          "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-14b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-14b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-14b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-14b",
            "legalbench:subset=proa,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1045.4850894632207,
          "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 126.65662658663405,
          "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-14b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-14b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (32B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 3468.912676056338,
          "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1990.955,
          "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 119.262,
          "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 242.846,
          "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 470.8357192982456,
          "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-32b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1542.0883229968654,
          "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-32b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-32b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-32b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-32b",
            "legalbench:subset=proa,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1045.4850894632207,
          "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 126.65662658663405,
          "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-32b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-32b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (72B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 4.994366197183099,
          "description": "min=4.994, mean=4.994, max=4.994, sum=4.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 3465.8591549295775,
          "description": "min=3465.859, mean=3465.859, max=3465.859, sum=3465.859 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 4.863,
          "description": "min=4.863, mean=4.863, max=4.863, sum=4.863 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.022,
          "description": "min=0.022, mean=0.022, max=0.022, sum=0.022 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1846.221,
          "description": "min=1846.221, mean=1846.221, max=1846.221, sum=1846.221 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 119.262,
          "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 242.846,
          "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 470.8357192982456,
          "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-72b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 4.25061224489796,
          "description": "min=2.253, mean=4.251, max=5, sum=21.253 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 940.3768944254368,
          "description": "min=192.453, mean=940.377, max=3422.157, sum=4701.884 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-72b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-72b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-72b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-72b",
            "legalbench:subset=proa,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1045.4850894632207,
          "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 126.65662658663405,
          "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-72b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-72b"
          ]
        }
      ],
      [
        {
          "value": "Qwen1.5 (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 3468.912676056338,
          "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1990.955,
          "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 119.262,
          "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 242.846,
          "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 470.8357192982456,
          "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1542.0883229968654,
          "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen1.5-7b",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen1.5-7b",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen1.5-7b",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen1.5-7b",
            "legalbench:subset=proa,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1045.4850894632207,
          "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 126.65662658663405,
          "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=de-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=fr-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=hi-en,model=qwen_qwen1.5-7b",
            "wmt_14:language_pair=ru-en,model=qwen_qwen1.5-7b"
          ]
        }
      ],
      [
        {
          "value": "Qwen2 Instruct (72B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 3502.912676056338,
          "description": "min=3502.913, mean=3502.913, max=3502.913, sum=3502.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 11.64225352112676,
          "description": "min=11.642, mean=11.642, max=11.642, sum=11.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 2017.955,
          "description": "min=2017.955, mean=2017.955, max=2017.955, sum=2017.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 9.044,
          "description": "min=9.044, mean=9.044, max=9.044, sum=9.044 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 146.262,
          "description": "min=146.262, mean=146.262, max=146.262, sum=146.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 6.433,
          "description": "min=6.433, mean=6.433, max=6.433, sum=6.433 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 249.846,
          "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 477.8357192982456,
          "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 173.89384019579856,
          "description": "min=145.36, mean=173.894, max=202.346, sum=1217.257 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 166.4,
          "description": "min=166.4, mean=166.4, max=166.4, sum=166.4 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2-72b-instruct,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1557.0883229968654,
          "description": "min=207.453, mean=1557.088, max=6445.714, sum=7785.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 2.2988842678904344,
          "description": "min=2, mean=2.299, max=3.042, sum=11.494 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2-72b-instruct",
            "legalbench:subset=proa,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1052.4850894632207,
          "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 142.65662658663405,
          "description": "min=124.855, mean=142.657, max=158.373, sum=713.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        },
        {
          "value": 27.028530260743235,
          "description": "min=25.368, mean=27.029, max=27.714, sum=135.143 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=de-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2-72b-instruct",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2-72b-instruct"
          ]
        }
      ],
      [
        {
          "value": "Qwen2.5 Instruct Turbo (72B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 3492.912676056338,
          "description": "min=3492.913, mean=3492.913, max=3492.913, sum=3492.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 8.71830985915493,
          "description": "min=8.718, mean=8.718, max=8.718, sum=8.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 2007.955,
          "description": "min=2007.955, mean=2007.955, max=2007.955, sum=2007.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 17.681,
          "description": "min=17.681, mean=17.681, max=17.681, sum=17.681 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 136.262,
          "description": "min=136.262, mean=136.262, max=136.262, sum=136.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 15.132,
          "description": "min=15.132, mean=15.132, max=15.132, sum=15.132 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 249.846,
          "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 477.8357192982456,
          "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 186.76438709076407,
          "description": "min=147.558, mean=186.764, max=230.288, sum=1307.351 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 198.303,
          "description": "min=198.303, mean=198.303, max=198.303, sum=198.303 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 1558.8883229968653,
          "description": "min=216.453, mean=1558.888, max=6440.714, sum=7794.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 2.452587326627195,
          "description": "min=2, mean=2.453, max=3.021, sum=12.263 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-72b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-72b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1052.4850894632207,
          "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 132.65662658663405,
          "description": "min=114.855, mean=132.657, max=148.373, sum=663.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        },
        {
          "value": 27.126178505887747,
          "description": "min=25.517, mean=27.126, max=27.755, sum=135.631 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-72b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-72b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Qwen2.5 Instruct Turbo (7B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 3492.912676056338,
          "description": "min=3492.913, mean=3492.913, max=3492.913, sum=3492.913 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.549295774647887,
          "description": "min=5.549, mean=5.549, max=5.549, sum=5.549 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 2007.955,
          "description": "min=2007.955, mean=2007.955, max=2007.955, sum=2007.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 8.698,
          "description": "min=8.698, mean=8.698, max=8.698, sum=8.698 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 136.262,
          "description": "min=136.262, mean=136.262, max=136.262, sum=136.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 7.041,
          "description": "min=7.041, mean=7.041, max=7.041, sum=7.041 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 249.846,
          "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 477.8357192982456,
          "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1323.836848955025,
          "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 196.8978610559394,
          "description": "min=156.674, mean=196.898, max=240.288, sum=1378.285 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 1130.403,
          "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 194.776,
          "description": "min=194.776, mean=194.776, max=194.776, sum=194.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 1558.8883229968653,
          "description": "min=216.453, mean=1558.888, max=6440.714, sum=7794.442 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 2.4015832496773273,
          "description": "min=2, mean=2.402, max=3.084, sum=12.008 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=corporate_lobbying,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=function_of_decision_section,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=international_citizenship_questions,model=qwen_qwen2.5-7b-instruct-turbo,stop=none",
            "legalbench:subset=proa,model=qwen_qwen2.5-7b-instruct-turbo,stop=none"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1052.4850894632207,
          "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 132.65662658663405,
          "description": "min=114.855, mean=132.657, max=148.373, sum=663.283 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        },
        {
          "value": 27.74173612173115,
          "description": "min=26.946, mean=27.742, max=28.649, sum=138.709 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=de-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=fr-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=hi-en,model=qwen_qwen2.5-7b-instruct-turbo",
            "wmt_14:language_pair=ru-en,model=qwen_qwen2.5-7b-instruct-turbo"
          ]
        }
      ],
      [
        {
          "value": "Arctic Instruct",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 4.261971830985916,
          "description": "min=4.262, mean=4.262, max=4.262, sum=4.262 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 3603.2169014084507,
          "description": "min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 11.907042253521126,
          "description": "min=11.907, mean=11.907, max=11.907, sum=11.907 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 4.825,
          "description": "min=4.825, mean=4.825, max=4.825, sum=4.825 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.028,
          "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 2311.514,
          "description": "min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 18.701,
          "description": "min=18.701, mean=18.701, max=18.701, sum=18.701 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 166.383,
          "description": "min=166.383, mean=166.383, max=166.383, sum=166.383 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 14.473,
          "description": "min=14.473, mean=14.473, max=14.473, sum=14.473 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 291.574,
          "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 531.5470877192982,
          "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1438.6362030100095,
          "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 98.80208187931566,
          "description": "min=82.872, mean=98.802, max=122.233, sum=691.615 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 1207.746,
          "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 189.305,
          "description": "min=189.305, mean=189.305, max=189.305, sum=189.305 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=snowflake_snowflake-arctic-instruct,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 4.162040816326531,
          "description": "min=1.81, mean=4.162, max=5, sum=20.81 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0016326530612244899,
          "description": "min=0, mean=0.002, max=0.008, sum=0.008 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1024.7220443430492,
          "description": "min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 2.4375592890361366,
          "description": "min=2, mean=2.438, max=3.421, sum=12.188 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=corporate_lobbying,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=function_of_decision_section,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=international_citizenship_questions,model=snowflake_snowflake-arctic-instruct",
            "legalbench:subset=proa,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1243.9005964214712,
          "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 160.28751290334915,
          "description": "min=145.523, mean=160.288, max=182.972, sum=801.438 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        },
        {
          "value": 30.59012702630372,
          "description": "min=28.596, mean=30.59, max=31.485, sum=152.951 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=de-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=fr-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=hi-en,model=snowflake_snowflake-arctic-instruct",
            "wmt_14:language_pair=ru-en,model=snowflake_snowflake-arctic-instruct"
          ]
        }
      ],
      [
        {
          "value": "Yi (34B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 4.867605633802817,
          "description": "min=4.868, mean=4.868, max=4.868, sum=4.868 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 3611.445070422535,
          "description": "min=3611.445, mean=3611.445, max=3611.445, sum=3611.445 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 4.838,
          "description": "min=4.838, mean=4.838, max=4.838, sum=4.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 2171.698,
          "description": "min=2171.698, mean=2171.698, max=2171.698, sum=2171.698 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.995,
          "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 131.695,
          "description": "min=131.695, mean=131.695, max=131.695, sum=131.695 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 260.002,
          "description": "min=260.002, mean=260.002, max=260.002, sum=260.002 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 502.65389473684206,
          "description": "min=383.67, mean=502.654, max=667.789, sum=2513.269 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-34b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1468.9352369693863,
          "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1170.814,
          "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 4.2,
          "description": "min=2, mean=4.2, max=5, sum=21 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 951.5242922438443,
          "description": "min=211.779, mean=951.524, max=3359.547, sum=4757.621 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-34b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-34b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-34b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-34b",
            "legalbench:subset=proa,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1122.3916500994035,
          "description": "min=1122.392, mean=1122.392, max=1122.392, sum=1122.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-34b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 187.09213851506345,
          "description": "min=139.298, mean=187.092, max=317.56, sum=935.461 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-34b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-34b"
          ]
        }
      ],
      [
        {
          "value": "Yi (6B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 4.867605633802817,
          "description": "min=4.868, mean=4.868, max=4.868, sum=4.868 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 3611.445070422535,
          "description": "min=3611.445, mean=3611.445, max=3611.445, sum=3611.445 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 4.838,
          "description": "min=4.838, mean=4.838, max=4.838, sum=4.838 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.026,
          "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 2171.698,
          "description": "min=2171.698, mean=2171.698, max=2171.698, sum=2171.698 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.995,
          "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 131.695,
          "description": "min=131.695, mean=131.695, max=131.695, sum=131.695 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 260.002,
          "description": "min=260.002, mean=260.002, max=260.002, sum=260.002 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 502.65389473684206,
          "description": "min=383.67, mean=502.654, max=667.789, sum=2513.269 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1468.9352369693863,
          "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=7 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1170.814,
          "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 4.2,
          "description": "min=2, mean=4.2, max=5, sum=21 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 951.5242922438443,
          "description": "min=211.779, mean=951.524, max=3359.547, sum=4757.621 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-6b",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-6b",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-6b",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-6b",
            "legalbench:subset=proa,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1122.3916500994035,
          "description": "min=1122.392, mean=1122.392, max=1122.392, sum=1122.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-6b"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 187.09213851506345,
          "description": "min=139.298, mean=187.092, max=317.56, sum=935.461 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=de-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-6b",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-6b"
          ]
        }
      ],
      [
        {
          "value": "Jurassic-2 Grande (17B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 3.2253521126760565,
          "description": "min=3.225, mean=3.225, max=3.225, sum=3.225 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 1700.7408450704224,
          "description": "min=1700.741, mean=1700.741, max=1700.741, sum=1700.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.03943661971831,
          "description": "min=5.039, mean=5.039, max=5.039, sum=5.039 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 4.697,
          "description": "min=4.697, mean=4.697, max=4.697, sum=4.697 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.038,
          "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 1522.929,
          "description": "min=1522.929, mean=1522.929, max=1522.929, sum=1522.929 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.441,
          "description": "min=5.441, mean=5.441, max=5.441, sum=5.441 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-grande"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 102.377,
          "description": "min=102.377, mean=102.377, max=102.377, sum=102.377 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 6.614,
          "description": "min=6.614, mean=6.614, max=6.614, sum=6.614 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-grande"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 188.75,
          "description": "min=188.75, mean=188.75, max=188.75, sum=188.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 396.7398596491228,
          "description": "min=308.59, mean=396.74, max=552.719, sum=1983.699 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-grande",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-grande"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 6.7781954887218046,
          "description": "min=2, mean=6.778, max=8, sum=47.447 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 943.4185034241337,
          "description": "min=450.154, mean=943.419, max=1490.395, sum=6603.93 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 140.29469320289397,
          "description": "min=74.123, mean=140.295, max=209.933, sum=982.063 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-grande"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 823.394,
          "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 121.336,
          "description": "min=121.336, mean=121.336, max=121.336, sum=121.336 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-grande"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 4.001224489795918,
          "description": "min=1.006, mean=4.001, max=5, sum=20.006 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0024489795918367346,
          "description": "min=0, mean=0.002, max=0.012, sum=0.012 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 503.1459259177527,
          "description": "min=171.042, mean=503.146, max=1514.22, sum=2515.73 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 2.0563001835066452,
          "description": "min=2, mean=2.056, max=2.216, sum=10.282 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-grande",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-grande",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-grande",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-grande",
            "legalbench:subset=proa,model=ai21_j2-grande"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 758.6222664015904,
          "description": "min=758.622, mean=758.622, max=758.622, sum=758.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-grande"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        },
        {
          "value": 135.46828404572565,
          "description": "min=123.229, mean=135.468, max=148.278, sum=677.341 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        },
        {
          "value": 19.050931430646887,
          "description": "min=17.372, mean=19.051, max=21.34, sum=95.255 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-grande",
            "wmt_14:language_pair=de-en,model=ai21_j2-grande",
            "wmt_14:language_pair=fr-en,model=ai21_j2-grande",
            "wmt_14:language_pair=hi-en,model=ai21_j2-grande",
            "wmt_14:language_pair=ru-en,model=ai21_j2-grande"
          ]
        }
      ],
      [
        {
          "value": "Jurassic-2 Jumbo (178B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 2534.4338028169013,
          "description": "min=2534.434, mean=2534.434, max=2534.434, sum=2534.434 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 6.583098591549295,
          "description": "min=6.583, mean=6.583, max=6.583, sum=6.583 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 4.931,
          "description": "min=4.931, mean=4.931, max=4.931, sum=4.931 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.012,
          "description": "min=0.012, mean=0.012, max=0.012, sum=0.012 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1687.673,
          "description": "min=1687.673, mean=1687.673, max=1687.673, sum=1687.673 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 4.785,
          "description": "min=4.785, mean=4.785, max=4.785, sum=4.785 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 102.377,
          "description": "min=102.377, mean=102.377, max=102.377, sum=102.377 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.79,
          "description": "min=5.79, mean=5.79, max=5.79, sum=5.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 188.75,
          "description": "min=188.75, mean=188.75, max=188.75, sum=188.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 396.7398596491228,
          "description": "min=308.59, mean=396.74, max=552.719, sum=1983.699 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_j2-jumbo",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1321.42226282263,
          "description": "min=796.795, mean=1321.422, max=2516.154, sum=9249.956 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 136.53809167621895,
          "description": "min=76.281, mean=136.538, max=220.133, sum=955.767 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 823.394,
          "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 102.036,
          "description": "min=102.036, mean=102.036, max=102.036, sum=102.036 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1120.4859259177529,
          "description": "min=171.042, mean=1120.486, max=4600.92, sum=5602.43 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 2.028218528610354,
          "description": "min=2, mean=2.028, max=2.098, sum=10.141 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_j2-jumbo",
            "legalbench:subset=corporate_lobbying,model=ai21_j2-jumbo",
            "legalbench:subset=function_of_decision_section,model=ai21_j2-jumbo",
            "legalbench:subset=international_citizenship_questions,model=ai21_j2-jumbo",
            "legalbench:subset=proa,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 758.6222664015904,
          "description": "min=758.622, mean=758.622, max=758.622, sum=758.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 135.46828404572565,
          "description": "min=123.229, mean=135.468, max=148.278, sum=677.341 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        },
        {
          "value": 24.062830708059337,
          "description": "min=19.839, mean=24.063, max=30.439, sum=120.314 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=de-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=fr-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=hi-en,model=ai21_j2-jumbo",
            "wmt_14:language_pair=ru-en,model=ai21_j2-jumbo"
          ]
        }
      ],
      [
        {
          "value": "Jamba Instruct",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 2555.4338028169013,
          "description": "min=2555.434, mean=2555.434, max=2555.434, sum=2555.434 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1774.04,
          "description": "min=1774.04, mean=1774.04, max=1774.04, sum=1774.04 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 118.377,
          "description": "min=118.377, mean=118.377, max=118.377, sum=118.377 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 195.75,
          "description": "min=195.75, mean=195.75, max=195.75, sum=195.75 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 403.7398596491228,
          "description": "min=315.59, mean=403.74, max=559.719, sum=2018.699 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-instruct",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1321.42226282263,
          "description": "min=796.795, mean=1321.422, max=2516.154, sum=9249.956 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 823.394,
          "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-instruct,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1127.1634769381612,
          "description": "min=177.042, mean=1127.163, max=4612.308, sum=5635.817 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-instruct",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-instruct",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-instruct",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-instruct",
            "legalbench:subset=proa,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 765.6222664015904,
          "description": "min=765.622, mean=765.622, max=765.622, sum=765.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 585.25,
          "description": "min=503, mean=585.25, max=832, sum=2341 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=4 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 143.26129939115307,
          "description": "min=129.229, mean=143.261, max=154.278, sum=573.045 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (4)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=de-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-instruct",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-instruct"
          ]
        }
      ],
      [
        {
          "value": "Jamba 1.5 Mini",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 3595.5971830985914,
          "description": "min=3595.597, mean=3595.597, max=3595.597, sum=3595.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 2333.076,
          "description": "min=2333.076, mean=2333.076, max=2333.076, sum=2333.076 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 152.394,
          "description": "min=152.394, mean=152.394, max=152.394, sum=152.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 261.348,
          "description": "min=261.348, mean=261.348, max=261.348, sum=261.348 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 508.1380701754386,
          "description": "min=397.58, mean=508.138, max=678.64, sum=2540.69 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-mini",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1458.376275861588,
          "description": "min=979.415, mean=1458.376, max=2550.115, sum=10208.634 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 1163.818,
          "description": "min=1163.818, mean=1163.818, max=1163.818, sum=1163.818 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-mini,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1601.842950915631,
          "description": "min=212.453, mean=1601.843, max=6618.612, sum=8009.215 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-mini",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-mini",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-mini",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-mini",
            "legalbench:subset=proa,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1085.2385685884692,
          "description": "min=1085.239, mean=1085.239, max=1085.239, sum=1085.239 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 151.07662629989292,
          "description": "min=120.386, mean=151.077, max=189.223, sum=755.383 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-mini",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-mini"
          ]
        }
      ],
      [
        {
          "value": "Jamba 1.5 Large",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 3595.5971830985914,
          "description": "min=3595.597, mean=3595.597, max=3595.597, sum=3595.597 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 2333.076,
          "description": "min=2333.076, mean=2333.076, max=2333.076, sum=2333.076 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 152.394,
          "description": "min=152.394, mean=152.394, max=152.394, sum=152.394 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 261.348,
          "description": "min=261.348, mean=261.348, max=261.348, sum=261.348 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 508.1380701754386,
          "description": "min=397.58, mean=508.138, max=678.64, sum=2540.69 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=ai21_jamba-1.5-large",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1458.376275861588,
          "description": "min=979.415, mean=1458.376, max=2550.115, sum=10208.634 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 1163.818,
          "description": "min=1163.818, mean=1163.818, max=1163.818, sum=1163.818 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=ai21_jamba-1.5-large,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1601.842950915631,
          "description": "min=212.453, mean=1601.843, max=6618.612, sum=8009.215 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=ai21_jamba-1.5-large",
            "legalbench:subset=corporate_lobbying,model=ai21_jamba-1.5-large",
            "legalbench:subset=function_of_decision_section,model=ai21_jamba-1.5-large",
            "legalbench:subset=international_citizenship_questions,model=ai21_jamba-1.5-large",
            "legalbench:subset=proa,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1085.2385685884692,
          "description": "min=1085.239, mean=1085.239, max=1085.239, sum=1085.239 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 151.07662629989292,
          "description": "min=120.386, mean=151.077, max=189.223, sum=755.383 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=de-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=fr-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=hi-en,model=ai21_jamba-1.5-large",
            "wmt_14:language_pair=ru-en,model=ai21_jamba-1.5-large"
          ]
        }
      ],
      [
        {
          "value": "Luminous Base (13B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 2.036619718309859,
          "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1694.6422535211268,
          "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.52112676056338,
          "description": "min=5.521, mean=5.521, max=5.521, sum=5.521 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 4.717,
          "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.038,
          "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1488.14,
          "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 10.866,
          "description": "min=10.866, mean=10.866, max=10.866, sum=10.866 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 116.087,
          "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.908,
          "description": "min=5.908, mean=5.908, max=5.908, sum=5.908 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 254.652,
          "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 471.0754736842106,
          "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-base",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 6.915558126084441,
          "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1184.139339428874,
          "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 139.6365272403828,
          "description": "min=114.077, mean=139.637, max=180.663, sum=977.456 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 943.121,
          "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 400.0,
          "description": "min=400, mean=400, max=400, sum=400 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 3.866938775510204,
          "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.1330612244897959,
          "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 566.5895794484264,
          "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1.6391061224489796,
          "description": "min=1, mean=1.639, max=4.027, sum=8.196 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-base",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-base",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-base",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-base",
            "legalbench:subset=proa,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1005.2286282306163,
          "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 157.2315362631901,
          "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        },
        {
          "value": 99.97375745526838,
          "description": "min=99.869, mean=99.974, max=100, sum=499.869 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-base",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-base"
          ]
        }
      ],
      [
        {
          "value": "Luminous Extended (30B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 2.036619718309859,
          "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1694.6422535211268,
          "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 6.335211267605634,
          "description": "min=6.335, mean=6.335, max=6.335, sum=6.335 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 4.717,
          "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.038,
          "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1488.14,
          "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 11.063,
          "description": "min=11.063, mean=11.063, max=11.063, sum=11.063 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 116.087,
          "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 6.869,
          "description": "min=6.869, mean=6.869, max=6.869, sum=6.869 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 254.652,
          "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 471.0754736842106,
          "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-extended",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 6.915558126084441,
          "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1184.139339428874,
          "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 142.86643564287382,
          "description": "min=92.684, mean=142.866, max=180.2, sum=1000.065 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 943.121,
          "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 400.0,
          "description": "min=400, mean=400, max=400, sum=400 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 3.866938775510204,
          "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.1330612244897959,
          "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 566.5895794484264,
          "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1.5478898257711229,
          "description": "min=1, mean=1.548, max=3.196, sum=7.739 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-extended",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-extended",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-extended",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-extended",
            "legalbench:subset=proa,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1005.2286282306163,
          "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 157.2315362631901,
          "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=500 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-extended",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-extended"
          ]
        }
      ],
      [
        {
          "value": "Luminous Supreme (70B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 2.036619718309859,
          "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1694.6422535211268,
          "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 5.6845070422535215,
          "description": "min=5.685, mean=5.685, max=5.685, sum=5.685 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 4.717,
          "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.038,
          "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1488.14,
          "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 6.864,
          "description": "min=6.864, mean=6.864, max=6.864, sum=6.864 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 116.087,
          "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 4.666,
          "description": "min=4.666, mean=4.666, max=4.666, sum=4.666 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 254.652,
          "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 471.0754736842106,
          "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 6.915558126084441,
          "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1184.139339428874,
          "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 127.58738933898053,
          "description": "min=90.605, mean=127.587, max=150.635, sum=893.112 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 943.121,
          "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 400.0,
          "description": "min=400, mean=400, max=400, sum=400 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 3.866938775510204,
          "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.1330612244897959,
          "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 566.5895794484264,
          "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.2657996218650946,
          "description": "min=1, mean=1.266, max=1.769, sum=6.329 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=corporate_lobbying,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=function_of_decision_section,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=international_citizenship_questions,model=AlephAlpha_luminous-supreme",
            "legalbench:subset=proa,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1005.2286282306163,
          "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 157.2315362631901,
          "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        },
        {
          "value": 100.0,
          "description": "min=100, mean=100, max=100, sum=500 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=de-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=fr-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=hi-en,model=AlephAlpha_luminous-supreme",
            "wmt_14:language_pair=ru-en,model=AlephAlpha_luminous-supreme"
          ]
        }
      ],
      [
        {
          "value": "Claude v1.3",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 9.338028169014084,
          "description": "min=9.338, mean=9.338, max=9.338, sum=9.338 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 4.964,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1734.363,
          "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 4.973,
          "description": "min=4.973, mean=4.973, max=4.973, sum=4.973 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 3.722,
          "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 328.79,
          "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 543.747298245614,
          "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-v1.3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1361.8141219676104,
          "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 79.49312981320325,
          "description": "min=53.133, mean=79.493, max=97.564, sum=556.452 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 104.726,
          "description": "min=104.726, mean=104.726, max=104.726, sum=104.726 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 4.797959183673469,
          "description": "min=4, mean=4.798, max=5, sum=23.99 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1621.3558670820687,
          "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1.3542176968306323,
          "description": "min=1, mean=1.354, max=2.232, sum=6.771 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-v1.3",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-v1.3",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-v1.3",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-v1.3",
            "legalbench:subset=proa,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1092.4373757455269,
          "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 218.57322077152472,
          "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        },
        {
          "value": 25.611364027374215,
          "description": "min=24.004, mean=25.611, max=26.28, sum=128.057 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=de-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-v1.3",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-v1.3"
          ]
        }
      ],
      [
        {
          "value": "Claude Instant 1.2",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 17.149295774647886,
          "description": "min=17.149, mean=17.149, max=17.149, sum=17.149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 4.964,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1734.363,
          "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 8.217,
          "description": "min=8.217, mean=8.217, max=8.217, sum=8.217 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.113,
          "description": "min=5.113, mean=5.113, max=5.113, sum=5.113 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 328.79,
          "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 543.747298245614,
          "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1361.8141219676104,
          "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 65.95586481608514,
          "description": "min=54.491, mean=65.956, max=76.513, sum=461.691 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 105.998,
          "description": "min=105.998, mean=105.998, max=105.998, sum=105.998 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 4.797959183673469,
          "description": "min=4, mean=4.798, max=5, sum=23.99 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1621.3558670820687,
          "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.6459798365122615,
          "description": "min=1, mean=1.646, max=2.219, sum=8.23 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-instant-1.2",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-instant-1.2",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-instant-1.2",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-instant-1.2",
            "legalbench:subset=proa,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1092.4373757455269,
          "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 218.57322077152472,
          "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        },
        {
          "value": 25.578513056277718,
          "description": "min=24.177, mean=25.579, max=26.326, sum=127.893 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=de-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-instant-1.2",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-instant-1.2"
          ]
        }
      ],
      [
        {
          "value": "Claude 2.0",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 10.56056338028169,
          "description": "min=10.561, mean=10.561, max=10.561, sum=10.561 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 4.964,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1734.363,
          "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 7.605,
          "description": "min=7.605, mean=7.605, max=7.605, sum=7.605 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 7.206,
          "description": "min=7.206, mean=7.206, max=7.206, sum=7.206 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 328.79,
          "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 543.747298245614,
          "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.0",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1361.8141219676104,
          "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 96.47352327848044,
          "description": "min=76.07, mean=96.474, max=115.288, sum=675.315 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 78.704,
          "description": "min=78.704, mean=78.704, max=78.704, sum=78.704 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 4.797959183673469,
          "description": "min=4, mean=4.798, max=5, sum=23.99 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1621.3558670820687,
          "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 3.338449275778001,
          "description": "min=1, mean=3.338, max=11.058, sum=16.692 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.0",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.0",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.0",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.0",
            "legalbench:subset=proa,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1092.4373757455269,
          "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 218.57322077152472,
          "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        },
        {
          "value": 25.65316323214559,
          "description": "min=24.254, mean=25.653, max=26.374, sum=128.266 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.0",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.0"
          ]
        }
      ],
      [
        {
          "value": "Claude 2.1",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 12.430985915492958,
          "description": "min=12.431, mean=12.431, max=12.431, sum=12.431 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 4.964,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1734.363,
          "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 19.738,
          "description": "min=19.738, mean=19.738, max=19.738, sum=19.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 11.053,
          "description": "min=11.053, mean=11.053, max=11.053, sum=11.053 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 328.79,
          "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 543.747298245614,
          "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1361.8141219676104,
          "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 96.71972910810119,
          "description": "min=79.825, mean=96.72, max=120.842, sum=677.038 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 98.553,
          "description": "min=98.553, mean=98.553, max=98.553, sum=98.553 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 4.797959183673469,
          "description": "min=4, mean=4.798, max=5, sum=23.99 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1621.3558670820687,
          "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.4554741431234763,
          "description": "min=1, mean=1.455, max=2.137, sum=7.277 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-2.1",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-2.1",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-2.1",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-2.1",
            "legalbench:subset=proa,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1092.4373757455269,
          "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 218.57322077152472,
          "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        },
        {
          "value": 25.235038327725952,
          "description": "min=24.439, mean=25.235, max=26.058, sum=126.175 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=de-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-2.1",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-2.1"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Haiku (20240307)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 44.264788732394365,
          "description": "min=44.265, mean=44.265, max=44.265, sum=44.265 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1781.799,
          "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 33.024,
          "description": "min=33.024, mean=33.024, max=33.024, sum=33.024 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 50.787,
          "description": "min=50.787, mean=50.787, max=50.787, sum=50.787 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 263.79,
          "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 478.747298245614,
          "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1362.8141219676104,
          "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 29.032964841043174,
          "description": "min=3.158, mean=29.033, max=87.17, sum=203.231 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 77.518,
          "description": "min=77.518, mean=77.518, max=77.518, sum=77.518 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1557.241581367783,
          "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 9.56470087480281,
          "description": "min=1, mean=9.565, max=28.352, sum=47.824 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-haiku-20240307",
            "legalbench:subset=proa,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1027.4373757455269,
          "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 219.57322077152472,
          "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        },
        {
          "value": 48.6129454044961,
          "description": "min=27.598, mean=48.613, max=93.673, sum=243.065 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-haiku-20240307",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-haiku-20240307"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Sonnet (20240229)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 30.371830985915494,
          "description": "min=30.372, mean=30.372, max=30.372, sum=30.372 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1781.799,
          "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 31.113,
          "description": "min=31.113, mean=31.113, max=31.113, sum=31.113 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 26.563,
          "description": "min=26.563, mean=26.563, max=26.563, sum=26.563 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 263.79,
          "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 478.747298245614,
          "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1362.8141219676104,
          "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 52.37429092508652,
          "description": "min=44.263, mean=52.374, max=62.256, sum=366.62 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 114.663,
          "description": "min=114.663, mean=114.663, max=114.663, sum=114.663 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1557.241581367783,
          "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 9.201869121421694,
          "description": "min=1, mean=9.202, max=27.753, sum=46.009 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-sonnet-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1027.4373757455269,
          "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 219.57322077152472,
          "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        },
        {
          "value": 26.05551068588469,
          "description": "min=24.517, mean=26.056, max=27.078, sum=130.278 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-sonnet-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-sonnet-20240229"
          ]
        }
      ],
      [
        {
          "value": "Claude 3 Opus (20240229)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 3709.7408450704224,
          "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 13.588732394366197,
          "description": "min=13.589, mean=13.589, max=13.589, sum=13.589 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1781.799,
          "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 39.248,
          "description": "min=39.248, mean=39.248, max=39.248, sum=39.248 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 189.259,
          "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.66,
          "description": "min=5.66, mean=5.66, max=5.66, sum=5.66 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 263.79,
          "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 478.747298245614,
          "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1362.8141219676104,
          "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 113.90635737624721,
          "description": "min=82.965, mean=113.906, max=138.263, sum=797.345 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1012.712,
          "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 115.934,
          "description": "min=115.934, mean=115.934, max=115.934, sum=115.934 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1557.241581367783,
          "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1.6045285459659269,
          "description": "min=1, mean=1.605, max=2.932, sum=8.023 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-opus-20240229",
            "legalbench:subset=proa,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1027.4373757455269,
          "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 219.57322077152472,
          "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        },
        {
          "value": 25.837047426976607,
          "description": "min=24.332, mean=25.837, max=26.616, sum=129.185 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-opus-20240229",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-opus-20240229"
          ]
        }
      ],
      [
        {
          "value": "Claude 3.5 Sonnet (20240620)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 3672.7408450704224,
          "description": "min=3672.741, mean=3672.741, max=3672.741, sum=3672.741 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 7.853521126760564,
          "description": "min=7.854, mean=7.854, max=7.854, sum=7.854 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1736.799,
          "description": "min=1736.799, mean=1736.799, max=1736.799, sum=1736.799 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 11.135,
          "description": "min=11.135, mean=11.135, max=11.135, sum=11.135 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 144.259,
          "description": "min=144.259, mean=144.259, max=144.259, sum=144.259 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 6.069,
          "description": "min=6.069, mean=6.069, max=6.069, sum=6.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 272.79,
          "description": "min=272.79, mean=272.79, max=272.79, sum=272.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 487.747298245614,
          "description": "min=379.26, mean=487.747, max=628.596, sum=2438.736 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1311.8141219676104,
          "description": "min=897.259, mean=1311.814, max=2329.808, sum=9182.699 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 143.9478793136688,
          "description": "min=93.333, mean=143.948, max=207.442, sum=1007.635 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 938.712,
          "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 165.163,
          "description": "min=165.163, mean=165.163, max=165.163, sum=165.163 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1566.241581367783,
          "description": "min=223.653, mean=1566.242, max=6437.398, sum=7831.208 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.3276925283235337,
          "description": "min=1, mean=1.328, max=2.053, sum=6.638 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=corporate_lobbying,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=function_of_decision_section,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=international_citizenship_questions,model=anthropic_claude-3-5-sonnet-20240620",
            "legalbench:subset=proa,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1036.4373757455269,
          "description": "min=1036.437, mean=1036.437, max=1036.437, sum=1036.437 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 162.5732207715247,
          "description": "min=141.406, mean=162.573, max=184.974, sum=812.866 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        },
        {
          "value": 25.85177875057348,
          "description": "min=24.282, mean=25.852, max=26.592, sum=129.259 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=de-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=fr-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=hi-en,model=anthropic_claude-3-5-sonnet-20240620",
            "wmt_14:language_pair=ru-en,model=anthropic_claude-3-5-sonnet-20240620"
          ]
        }
      ],
      [
        {
          "value": "Command",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 1.9408450704225353,
          "description": "min=1.941, mean=1.941, max=1.941, sum=1.941 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 1660.4845070422534,
          "description": "min=1660.485, mean=1660.485, max=1660.485, sum=1660.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 7.44225352112676,
          "description": "min=7.442, mean=7.442, max=7.442, sum=7.442 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 4.617,
          "description": "min=4.617, mean=4.617, max=4.617, sum=4.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 0.039,
          "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 1557.639,
          "description": "min=1557.639, mean=1557.639, max=1557.639, sum=1557.639 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 8.461,
          "description": "min=8.461, mean=8.461, max=8.461, sum=8.461 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 115.191,
          "description": "min=115.191, mean=115.191, max=115.191, sum=115.191 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 5.679,
          "description": "min=5.679, mean=5.679, max=5.679, sum=5.679 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 246.682,
          "description": "min=246.682, mean=246.682, max=246.682, sum=246.682 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 481.26021052631575,
          "description": "min=372.75, mean=481.26, max=628.421, sum=2406.301 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 6.877964141122035,
          "description": "min=2.962, mean=6.878, max=8, sum=48.146 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 1177.3289276411065,
          "description": "min=925.333, mean=1177.329, max=1534.058, sum=8241.302 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 116.48968047229982,
          "description": "min=94.488, mean=116.49, max=135.115, sum=815.428 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 942.424,
          "description": "min=942.424, mean=942.424, max=942.424, sum=942.424 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 94.43,
          "description": "min=94.43, mean=94.43, max=94.43, sum=94.43 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 3.8775510204081636,
          "description": "min=0.388, mean=3.878, max=5, sum=19.388 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 0.002857142857142857,
          "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 566.5014751745068,
          "description": "min=205.295, mean=566.501, max=1529.327, sum=2832.507 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 1.7895877106155815,
          "description": "min=1, mean=1.79, max=3.055, sum=8.948 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command",
            "legalbench:subset=corporate_lobbying,model=cohere_command",
            "legalbench:subset=function_of_decision_section,model=cohere_command",
            "legalbench:subset=international_citizenship_questions,model=cohere_command",
            "legalbench:subset=proa,model=cohere_command"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 1016.7375745526839,
          "description": "min=1016.738, mean=1016.738, max=1016.738, sum=1016.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        },
        {
          "value": 149.45941179844013,
          "description": "min=129.757, mean=149.459, max=178.821, sum=747.297 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        },
        {
          "value": 31.800405260743236,
          "description": "min=27.65, mean=31.8, max=41.789, sum=159.002 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command",
            "wmt_14:language_pair=de-en,model=cohere_command",
            "wmt_14:language_pair=fr-en,model=cohere_command",
            "wmt_14:language_pair=hi-en,model=cohere_command",
            "wmt_14:language_pair=ru-en,model=cohere_command"
          ]
        }
      ],
      [
        {
          "value": "Command Light",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 1.9408450704225353,
          "description": "min=1.941, mean=1.941, max=1.941, sum=1.941 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 1660.4845070422534,
          "description": "min=1660.485, mean=1660.485, max=1660.485, sum=1660.485 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 10.814084507042253,
          "description": "min=10.814, mean=10.814, max=10.814, sum=10.814 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 4.617,
          "description": "min=4.617, mean=4.617, max=4.617, sum=4.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 0.039,
          "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 1557.639,
          "description": "min=1557.639, mean=1557.639, max=1557.639, sum=1557.639 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 10.869,
          "description": "min=10.869, mean=10.869, max=10.869, sum=10.869 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-light"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 115.191,
          "description": "min=115.191, mean=115.191, max=115.191, sum=115.191 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 17.348,
          "description": "min=17.348, mean=17.348, max=17.348, sum=17.348 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-light"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 246.682,
          "description": "min=246.682, mean=246.682, max=246.682, sum=246.682 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 481.26021052631575,
          "description": "min=372.75, mean=481.26, max=628.421, sum=2406.301 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-light",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-light"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 6.877964141122035,
          "description": "min=2.962, mean=6.878, max=8, sum=48.146 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 1177.3289276411065,
          "description": "min=925.333, mean=1177.329, max=1534.058, sum=8241.302 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 106.58875792143844,
          "description": "min=83.228, mean=106.589, max=137.692, sum=746.121 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-light"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 942.424,
          "description": "min=942.424, mean=942.424, max=942.424, sum=942.424 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 80.184,
          "description": "min=80.184, mean=80.184, max=80.184, sum=80.184 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-light"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 3.8775510204081636,
          "description": "min=0.388, mean=3.878, max=5, sum=19.388 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 0.002857142857142857,
          "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 566.5014751745068,
          "description": "min=205.295, mean=566.501, max=1529.327, sum=2832.507 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 6.63968330089529,
          "description": "min=1.074, mean=6.64, max=23.614, sum=33.198 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-light",
            "legalbench:subset=corporate_lobbying,model=cohere_command-light",
            "legalbench:subset=function_of_decision_section,model=cohere_command-light",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-light",
            "legalbench:subset=proa,model=cohere_command-light"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 1016.7375745526839,
          "description": "min=1016.738, mean=1016.738, max=1016.738, sum=1016.738 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-light"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        },
        {
          "value": 149.45941179844013,
          "description": "min=129.757, mean=149.459, max=178.821, sum=747.297 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        },
        {
          "value": 39.88511765942805,
          "description": "min=30.895, mean=39.885, max=47.65, sum=199.426 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-light",
            "wmt_14:language_pair=de-en,model=cohere_command-light",
            "wmt_14:language_pair=fr-en,model=cohere_command-light",
            "wmt_14:language_pair=hi-en,model=cohere_command-light",
            "wmt_14:language_pair=ru-en,model=cohere_command-light"
          ]
        }
      ],
      [
        {
          "value": "Command R",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 3442.6535211267606,
          "description": "min=3442.654, mean=3442.654, max=3442.654, sum=3442.654 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 2069.055,
          "description": "min=2069.055, mean=2069.055, max=2069.055, sum=2069.055 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 160.159,
          "description": "min=160.159, mean=160.159, max=160.159, sum=160.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 260.678,
          "description": "min=260.678, mean=260.678, max=260.678, sum=260.678 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 499.48978947368425,
          "description": "min=397.66, mean=499.49, max=661.579, sum=2497.449 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 1406.1074103714861,
          "description": "min=974.156, mean=1406.107, max=2423.596, sum=9842.752 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 1158.893,
          "description": "min=1158.893, mean=1158.893, max=1158.893, sum=1158.893 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 1582.6169819753743,
          "description": "min=223.126, mean=1582.617, max=6507.029, sum=7913.085 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r",
            "legalbench:subset=proa,model=cohere_command-r"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 1062.9045725646124,
          "description": "min=1062.905, mean=1062.905, max=1062.905, sum=1062.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        },
        {
          "value": 127.94422599021257,
          "description": "min=114.404, mean=127.944, max=146.584, sum=639.721 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r",
            "wmt_14:language_pair=de-en,model=cohere_command-r",
            "wmt_14:language_pair=fr-en,model=cohere_command-r",
            "wmt_14:language_pair=hi-en,model=cohere_command-r",
            "wmt_14:language_pair=ru-en,model=cohere_command-r"
          ]
        }
      ],
      [
        {
          "value": "Command R Plus",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 3442.6535211267606,
          "description": "min=3442.654, mean=3442.654, max=3442.654, sum=3442.654 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 2069.055,
          "description": "min=2069.055, mean=2069.055, max=2069.055, sum=2069.055 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 160.159,
          "description": "min=160.159, mean=160.159, max=160.159, sum=160.159 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 260.678,
          "description": "min=260.678, mean=260.678, max=260.678, sum=260.678 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 499.48978947368425,
          "description": "min=397.66, mean=499.49, max=661.579, sum=2497.449 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=cohere_command-r-plus",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1406.1074103714861,
          "description": "min=974.156, mean=1406.107, max=2423.596, sum=9842.752 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 1158.893,
          "description": "min=1158.893, mean=1158.893, max=1158.893, sum=1158.893 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=cohere_command-r-plus,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1582.6169819753743,
          "description": "min=223.126, mean=1582.617, max=6507.029, sum=7913.085 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=cohere_command-r-plus",
            "legalbench:subset=corporate_lobbying,model=cohere_command-r-plus",
            "legalbench:subset=function_of_decision_section,model=cohere_command-r-plus",
            "legalbench:subset=international_citizenship_questions,model=cohere_command-r-plus",
            "legalbench:subset=proa,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1062.9045725646124,
          "description": "min=1062.905, mean=1062.905, max=1062.905, sum=1062.905 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=cohere_command-r-plus"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 127.94422599021257,
          "description": "min=114.404, mean=127.944, max=146.584, sum=639.721 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=de-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=fr-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=hi-en,model=cohere_command-r-plus",
            "wmt_14:language_pair=ru-en,model=cohere_command-r-plus"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.0 Pro (002)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 3447.994366197183,
          "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1978.347,
          "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 153.995,
          "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 248.508,
          "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 481.5305263157895,
          "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.0-pro-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1355.5064552904823,
          "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 1151.885,
          "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.0-pro-002,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1558.2386051001386,
          "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.0-pro-002",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.0-pro-002",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.0-pro-002",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.0-pro-002",
            "legalbench:subset=proa,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1029.4811133200794,
          "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 120.97025108961614,
          "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=de-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.0-pro-002",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.0-pro-002"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Pro (001)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 3447.994366197183,
          "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1978.347,
          "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 153.995,
          "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 248.508,
          "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 481.5305263157895,
          "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-pro-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1355.5064552904823,
          "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 1151.885,
          "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-pro-001,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1558.2386051001386,
          "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-pro-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-pro-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-pro-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-pro-001",
            "legalbench:subset=proa,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1029.4811133200794,
          "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 120.97025108961614,
          "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-pro-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-pro-001"
          ]
        }
      ],
      [
        {
          "value": "Gemini 1.5 Flash (001)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 3447.994366197183,
          "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1978.347,
          "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 153.995,
          "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 248.508,
          "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 481.5305263157895,
          "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-1.5-flash-001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1355.5064552904823,
          "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 1151.885,
          "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_gemini-1.5-flash-001,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1558.2386051001386,
          "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_gemini-1.5-flash-001",
            "legalbench:subset=corporate_lobbying,model=google_gemini-1.5-flash-001",
            "legalbench:subset=function_of_decision_section,model=google_gemini-1.5-flash-001",
            "legalbench:subset=international_citizenship_questions,model=google_gemini-1.5-flash-001",
            "legalbench:subset=proa,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1029.4811133200794,
          "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 120.97025108961614,
          "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=de-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=fr-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=hi-en,model=google_gemini-1.5-flash-001",
            "wmt_14:language_pair=ru-en,model=google_gemini-1.5-flash-001"
          ]
        }
      ],
      [
        {
          "value": "PaLM-2 (Bison)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 4414.2338028169015,
          "description": "min=4414.234, mean=4414.234, max=4414.234, sum=4414.234 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 7.997183098591549,
          "description": "min=7.997, mean=7.997, max=7.997, sum=7.997 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 4.906,
          "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.015,
          "description": "min=0.015, mean=0.015, max=0.015, sum=0.015 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 2124.565,
          "description": "min=2124.565, mean=2124.565, max=2124.565, sum=2124.565 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 7.358,
          "description": "min=7.358, mean=7.358, max=7.358, sum=7.358 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-bison@001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 190.187,
          "description": "min=190.187, mean=190.187, max=190.187, sum=190.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 4.48,
          "description": "min=4.48, mean=4.48, max=4.48, sum=4.48 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-bison@001"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 253.308,
          "description": "min=253.308, mean=253.308, max=253.308, sum=253.308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 487.29354385964905,
          "description": "min=360.7, mean=487.294, max=638.088, sum=2436.468 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 1439.842989280994,
          "description": "min=1004.274, mean=1439.843, max=2386.942, sum=10078.901 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 66.89023408252294,
          "description": "min=38.4, mean=66.89, max=88.316, sum=468.232 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-bison@001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 1109.549,
          "description": "min=1109.549, mean=1109.549, max=1109.549, sum=1109.549 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 94.258,
          "description": "min=94.258, mean=94.258, max=94.258, sum=94.258 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-bison@001"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 4.397551020408163,
          "description": "min=2.988, mean=4.398, max=5, sum=21.988 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 1387.966233478402,
          "description": "min=287.432, mean=1387.966, max=5134.504, sum=6939.831 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.3893499784884555,
          "description": "min=1, mean=1.389, max=2.347, sum=6.947 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-bison@001",
            "legalbench:subset=corporate_lobbying,model=google_text-bison@001",
            "legalbench:subset=function_of_decision_section,model=google_text-bison@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-bison@001",
            "legalbench:subset=proa,model=google_text-bison@001"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 1138.6222664015904,
          "description": "min=1138.622, mean=1138.622, max=1138.622, sum=1138.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-bison@001"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        },
        {
          "value": 183.58714444104604,
          "description": "min=145.755, mean=183.587, max=206.169, sum=917.936 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        },
        {
          "value": 29.980943664933477,
          "description": "min=28.076, mean=29.981, max=31.366, sum=149.905 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-bison@001",
            "wmt_14:language_pair=de-en,model=google_text-bison@001",
            "wmt_14:language_pair=fr-en,model=google_text-bison@001",
            "wmt_14:language_pair=hi-en,model=google_text-bison@001",
            "wmt_14:language_pair=ru-en,model=google_text-bison@001"
          ]
        }
      ],
      [
        {
          "value": "PaLM-2 (Unicorn)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 4414.2338028169015,
          "description": "min=4414.234, mean=4414.234, max=4414.234, sum=4414.234 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 16.543661971830986,
          "description": "min=16.544, mean=16.544, max=16.544, sum=16.544 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 4.906,
          "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.015,
          "description": "min=0.015, mean=0.015, max=0.015, sum=0.015 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 2124.565,
          "description": "min=2124.565, mean=2124.565, max=2124.565, sum=2124.565 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 13.327,
          "description": "min=13.327, mean=13.327, max=13.327, sum=13.327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 190.187,
          "description": "min=190.187, mean=190.187, max=190.187, sum=190.187 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 9.803,
          "description": "min=9.803, mean=9.803, max=9.803, sum=9.803 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 253.308,
          "description": "min=253.308, mean=253.308, max=253.308, sum=253.308 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 487.29354385964905,
          "description": "min=360.7, mean=487.294, max=638.088, sum=2436.468 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1439.842989280994,
          "description": "min=1004.274, mean=1439.843, max=2386.942, sum=10078.901 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 80.45819114472725,
          "description": "min=59.9, mean=80.458, max=98.342, sum=563.207 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1109.549,
          "description": "min=1109.549, mean=1109.549, max=1109.549, sum=1109.549 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 93.764,
          "description": "min=93.764, mean=93.764, max=93.764, sum=93.764 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 4.397551020408163,
          "description": "min=2.988, mean=4.398, max=5, sum=21.988 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1387.966233478402,
          "description": "min=287.432, mean=1387.966, max=5134.504, sum=6939.831 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.3642506811989101,
          "description": "min=1, mean=1.364, max=2.2, sum=6.821 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=google_text-unicorn@001",
            "legalbench:subset=corporate_lobbying,model=google_text-unicorn@001",
            "legalbench:subset=function_of_decision_section,model=google_text-unicorn@001",
            "legalbench:subset=international_citizenship_questions,model=google_text-unicorn@001",
            "legalbench:subset=proa,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1138.6222664015904,
          "description": "min=1138.622, mean=1138.622, max=1138.622, sum=1138.622 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=google_text-unicorn@001"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 183.58714444104604,
          "description": "min=145.755, mean=183.587, max=206.169, sum=917.936 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        },
        {
          "value": 30.567241263954735,
          "description": "min=28.596, mean=30.567, max=31.734, sum=152.836 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=de-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=fr-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=hi-en,model=google_text-unicorn@001",
            "wmt_14:language_pair=ru-en,model=google_text-unicorn@001"
          ]
        }
      ],
      [
        {
          "value": "Yi Large (Preview)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 3724.042253521127,
          "description": "min=3724.042, mean=3724.042, max=3724.042, sum=3724.042 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 21.512676056338027,
          "description": "min=21.513, mean=21.513, max=21.513, sum=21.513 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 4.983,
          "description": "min=4.983, mean=4.983, max=4.983, sum=4.983 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.003,
          "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 2368.513,
          "description": "min=2368.513, mean=2368.513, max=2368.513, sum=2368.513 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 23.703,
          "description": "min=23.703, mean=23.703, max=23.703, sum=23.703 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 160.695,
          "description": "min=160.695, mean=160.695, max=160.695, sum=160.695 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 4.629,
          "description": "min=4.629, mean=4.629, max=4.629, sum=4.629 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 273.002,
          "description": "min=273.002, mean=273.002, max=273.002, sum=273.002 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 515.6538947368421,
          "description": "min=396.67, mean=515.654, max=680.789, sum=2578.269 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-large-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1468.9352369693863,
          "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 254.00484808722263,
          "description": "min=189.756, mean=254.005, max=296.346, sum=1778.034 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 1170.814,
          "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 288.079,
          "description": "min=288.079, mean=288.079, max=288.079, sum=288.079 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=01-ai_yi-large-preview,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1656.0949044887425,
          "description": "min=228.779, mean=1656.095, max=6814.4, sum=8280.475 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 3.339402150569105,
          "description": "min=1, mean=3.339, max=6.263, sum=16.697 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=01-ai_yi-large-preview",
            "legalbench:subset=corporate_lobbying,model=01-ai_yi-large-preview",
            "legalbench:subset=function_of_decision_section,model=01-ai_yi-large-preview",
            "legalbench:subset=international_citizenship_questions,model=01-ai_yi-large-preview",
            "legalbench:subset=proa,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1135.3916500994035,
          "description": "min=1135.392, mean=1135.392, max=1135.392, sum=1135.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 205.09213851506343,
          "description": "min=157.298, mean=205.092, max=335.56, sum=1025.461 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        },
        {
          "value": 29.058130065759293,
          "description": "min=24.773, mean=29.058, max=36.698, sum=145.291 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=de-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=fr-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=hi-en,model=01-ai_yi-large-preview",
            "wmt_14:language_pair=ru-en,model=01-ai_yi-large-preview"
          ]
        }
      ],
      [
        {
          "value": "Mistral Small (2402)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 3955.3295774647886,
          "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 21.774647887323944,
          "description": "min=21.775, mean=21.775, max=21.775, sum=21.775 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 2537.79,
          "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 21.017,
          "description": "min=21.017, mean=21.017, max=21.017, sum=21.017 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 211.069,
          "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 27.473,
          "description": "min=27.473, mean=27.473, max=27.473, sum=27.473 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 280.15,
          "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 523.0910877192983,
          "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-small-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1525.2664139976257,
          "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 125.52572529016837,
          "description": "min=104.221, mean=125.526, max=154.904, sum=878.68 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1255.268,
          "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 148.06,
          "description": "min=148.06, mean=148.06, max=148.06, sum=148.06 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 312.4,
          "description": "min=50, mean=312.4, max=1000, sum=1562 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1783.2521685070988,
          "description": "min=219.46, mean=1783.252, max=7251, sum=8916.261 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 12.778290319804961,
          "description": "min=1.716, mean=12.778, max=30, sum=63.891 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-small-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-small-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-small-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-small-2402",
            "legalbench:subset=proa,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1193.0934393638172,
          "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 212.43317355482492,
          "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        },
        {
          "value": 28.29957084416578,
          "description": "min=26.479, mean=28.3, max=29.024, sum=141.498 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-small-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-small-2402"
          ]
        }
      ],
      [
        {
          "value": "Mistral Medium (2312)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 3955.3295774647886,
          "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 24.88450704225352,
          "description": "min=24.885, mean=24.885, max=24.885, sum=24.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 2537.79,
          "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 32.377,
          "description": "min=32.377, mean=32.377, max=32.377, sum=32.377 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 211.069,
          "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 34.263,
          "description": "min=34.263, mean=34.263, max=34.263, sum=34.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 280.15,
          "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.968,
          "description": "min=0.968, mean=0.968, max=0.968, sum=0.968 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 523.0910877192983,
          "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.9702456140350877,
          "description": "min=0.93, mean=0.97, max=0.991, sum=4.851 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-medium-2312",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1525.2664139976257,
          "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 113.3278270462481,
          "description": "min=80, mean=113.328, max=132.25, sum=793.295 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1255.268,
          "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 137.554,
          "description": "min=137.554, mean=137.554, max=137.554, sum=137.554 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1801.7482458432964,
          "description": "min=287.453, mean=1801.748, max=7275.488, sum=9008.741 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 8.47642872361909,
          "description": "min=1.008, mean=8.476, max=25.305, sum=42.382 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-medium-2312",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-medium-2312",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-medium-2312",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-medium-2312",
            "legalbench:subset=proa,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1193.0934393638172,
          "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.9502982107355865,
          "description": "min=0.95, mean=0.95, max=0.95, sum=0.95 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 212.43317355482492,
          "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        },
        {
          "value": 27.81599632971402,
          "description": "min=26.33, mean=27.816, max=30.692, sum=139.08 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-medium-2312",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-medium-2312"
          ]
        }
      ],
      [
        {
          "value": "Mistral Large (2402)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 3955.3295774647886,
          "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 22.614084507042254,
          "description": "min=22.614, mean=22.614, max=22.614, sum=22.614 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 2537.79,
          "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 27.337,
          "description": "min=27.337, mean=27.337, max=27.337, sum=27.337 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 211.069,
          "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 27.91,
          "description": "min=27.91, mean=27.91, max=27.91, sum=27.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 280.15,
          "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 523.0910877192983,
          "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2402",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1525.2664139976257,
          "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 128.21647245723133,
          "description": "min=101.444, mean=128.216, max=154.897, sum=897.515 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1255.268,
          "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 129.185,
          "description": "min=129.185, mean=129.185, max=129.185, sum=129.185 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 312.4,
          "description": "min=50, mean=312.4, max=1000, sum=1562 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1783.2521685070988,
          "description": "min=219.46, mean=1783.252, max=7251, sum=8916.261 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 8.217420478990393,
          "description": "min=1.005, mean=8.217, max=25.86, sum=41.087 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2402",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2402",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2402",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2402",
            "legalbench:subset=proa,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1193.0934393638172,
          "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 212.43317355482492,
          "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        },
        {
          "value": 29.04227089386756,
          "description": "min=27.272, mean=29.042, max=29.871, sum=145.211 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2402",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2402"
          ]
        }
      ],
      [
        {
          "value": "Mistral Large 2 (2407)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 3914.3295774647886,
          "description": "min=3914.33, mean=3914.33, max=3914.33, sum=3914.33 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 6.2,
          "description": "min=6.2, mean=6.2, max=6.2, sum=6.2 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 2488.79,
          "description": "min=2488.79, mean=2488.79, max=2488.79, sum=2488.79 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 7.994,
          "description": "min=7.994, mean=7.994, max=7.994, sum=7.994 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 162.069,
          "description": "min=162.069, mean=162.069, max=162.069, sum=162.069 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 6.496,
          "description": "min=6.496, mean=6.496, max=6.496, sum=6.496 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 290.15,
          "description": "min=290.15, mean=290.15, max=290.15, sum=290.15 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 533.0910877192983,
          "description": "min=412.44, mean=533.091, max=697.175, sum=2665.455 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mistral-large-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1456.2664139976257,
          "description": "min=992.615, mean=1456.266, max=2503.962, sum=10193.865 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 180.3187090913529,
          "description": "min=129.395, mean=180.319, max=220.298, sum=1262.231 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 1187.268,
          "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 205.748,
          "description": "min=205.748, mean=205.748, max=205.748, sum=205.748 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_mistral-large-2407,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1741.3482458432961,
          "description": "min=227.453, mean=1741.348, max=7215.488, sum=8706.741 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 3.484006654237774,
          "description": "min=1, mean=3.484, max=8.469, sum=17.42 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_mistral-large-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_mistral-large-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_mistral-large-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_mistral-large-2407",
            "legalbench:subset=proa,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1203.0934393638172,
          "description": "min=1203.093, mean=1203.093, max=1203.093, sum=1203.093 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 153.43317355482492,
          "description": "min=139.306, mean=153.433, max=172.018, sum=767.166 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        },
        {
          "value": 30.30625095580364,
          "description": "min=29.153, mean=30.306, max=33.358, sum=151.531 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=de-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_mistral-large-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_mistral-large-2407"
          ]
        }
      ],
      [
        {
          "value": "Mistral NeMo (2402)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 3527.3915492957744,
          "description": "min=3527.392, mean=3527.392, max=3527.392, sum=3527.392 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 6.901408450704225,
          "description": "min=6.901, mean=6.901, max=6.901, sum=6.901 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 2032.962,
          "description": "min=2032.962, mean=2032.962, max=2032.962, sum=2032.962 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.927,
          "description": "min=5.927, mean=5.927, max=5.927, sum=5.927 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 137.405,
          "description": "min=137.405, mean=137.405, max=137.405, sum=137.405 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 3.595,
          "description": "min=3.595, mean=3.595, max=3.595, sum=3.595 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 248.246,
          "description": "min=248.246, mean=248.246, max=248.246, sum=248.246 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 479.9241754385965,
          "description": "min=377.89, mean=479.924, max=631.851, sum=2399.621 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1317.2798769434019,
          "description": "min=915.846, mean=1317.28, max=2238.885, sum=9220.959 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 111.74533800213115,
          "description": "min=97.456, mean=111.745, max=141.433, sum=782.217 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 1134.356,
          "description": "min=1134.356, mean=1134.356, max=1134.356, sum=1134.356 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 187.859,
          "description": "min=187.859, mean=187.859, max=187.859, sum=187.859 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=mistralai_open-mistral-nemo-2407,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1561.3600575619662,
          "description": "min=200.716, mean=1561.36, max=6486.116, sum=7806.8 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 8.473099835809844,
          "description": "min=4.94, mean=8.473, max=15.796, sum=42.365 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=corporate_lobbying,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=function_of_decision_section,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=international_citizenship_questions,model=mistralai_open-mistral-nemo-2407",
            "legalbench:subset=proa,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1022.5427435387674,
          "description": "min=1022.543, mean=1022.543, max=1022.543, sum=1022.543 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 110.16282784064842,
          "description": "min=81.661, mean=110.163, max=135.306, sum=550.814 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        },
        {
          "value": 26.541759538920324,
          "description": "min=24.622, mean=26.542, max=27.26, sum=132.709 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=de-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=fr-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=hi-en,model=mistralai_open-mistral-nemo-2407",
            "wmt_14:language_pair=ru-en,model=mistralai_open-mistral-nemo-2407"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 (text-davinci-003)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 4.954929577464789,
          "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 3479.56338028169,
          "description": "min=3479.563, mean=3479.563, max=3479.563, sum=3479.563 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 9.732394366197184,
          "description": "min=9.732, mean=9.732, max=9.732, sum=9.732 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 4.885,
          "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.02,
          "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1617.729,
          "description": "min=1617.729, mean=1617.729, max=1617.729, sum=1617.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 6.8,
          "description": "min=6.8, mean=6.8, max=6.8, sum=6.8 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 116.254,
          "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 7.074,
          "description": "min=7.074, mean=7.074, max=7.074, sum=7.074 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 254.21,
          "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 472.2740350877192,
          "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-003",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1375.7353092779654,
          "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 74.93793702104595,
          "description": "min=61.333, mean=74.938, max=97.115, sum=524.566 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 938.869,
          "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 93.717,
          "description": "min=93.717, mean=93.717, max=93.717, sum=93.717 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 4.210612244897959,
          "description": "min=2.053, mean=4.211, max=5, sum=21.053 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 907.3872120499769,
          "description": "min=205.632, mean=907.387, max=3225.32, sum=4536.936 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1.1675708408818857,
          "description": "min=1, mean=1.168, max=1.443, sum=5.838 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-003",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-003",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-003",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-003",
            "legalbench:subset=proa,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1038.8608349900596,
          "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-003"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 181.69386660804403,
          "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        },
        {
          "value": 25.117336366416882,
          "description": "min=23.563, mean=25.117, max=25.652, sum=125.587 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-003",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-003"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 (text-davinci-002)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 4.954929577464789,
          "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 3479.56338028169,
          "description": "min=3479.563, mean=3479.563, max=3479.563, sum=3479.563 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 8.447887323943663,
          "description": "min=8.448, mean=8.448, max=8.448, sum=8.448 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 4.885,
          "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.02,
          "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1617.729,
          "description": "min=1617.729, mean=1617.729, max=1617.729, sum=1617.729 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 6.632,
          "description": "min=6.632, mean=6.632, max=6.632, sum=6.632 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 116.254,
          "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 4.116,
          "description": "min=4.116, mean=4.116, max=4.116, sum=4.116 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 254.21,
          "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 472.2740350877192,
          "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_text-davinci-002",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1375.7353092779654,
          "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 136.82193804427587,
          "description": "min=76.721, mean=136.822, max=259.175, sum=957.754 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 938.869,
          "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 90.543,
          "description": "min=90.543, mean=90.543, max=90.543, sum=90.543 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 4.210612244897959,
          "description": "min=2.053, mean=4.211, max=5, sum=21.053 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 907.3872120499769,
          "description": "min=205.632, mean=907.387, max=3225.32, sum=4536.936 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1.0991972687655298,
          "description": "min=0.996, mean=1.099, max=1.238, sum=5.496 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_text-davinci-002",
            "legalbench:subset=corporate_lobbying,model=openai_text-davinci-002",
            "legalbench:subset=function_of_decision_section,model=openai_text-davinci-002",
            "legalbench:subset=international_citizenship_questions,model=openai_text-davinci-002",
            "legalbench:subset=proa,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1038.8608349900596,
          "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_text-davinci-002"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 181.69386660804403,
          "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        },
        {
          "value": 24.86174013610644,
          "description": "min=23.557, mean=24.862, max=25.636, sum=124.309 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=de-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=fr-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=hi-en,model=openai_text-davinci-002",
            "wmt_14:language_pair=ru-en,model=openai_text-davinci-002"
          ]
        }
      ],
      [
        {
          "value": "GPT-3.5 Turbo (0613)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 4.946478873239436,
          "description": "min=4.946, mean=4.946, max=4.946, sum=4.946 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 3493.6619718309857,
          "description": "min=3493.662, mean=3493.662, max=3493.662, sum=3493.662 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 9.909859154929578,
          "description": "min=9.91, mean=9.91, max=9.91, sum=9.91 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 4.884,
          "description": "min=4.884, mean=4.884, max=4.884, sum=4.884 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.019,
          "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1649.552,
          "description": "min=1649.552, mean=1649.552, max=1649.552, sum=1649.552 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 9.389,
          "description": "min=9.389, mean=9.389, max=9.389, sum=9.389 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 173.127,
          "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 5.576,
          "description": "min=5.576, mean=5.576, max=5.576, sum=5.576 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 242.782,
          "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 460.71996491228066,
          "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1323.910874184069,
          "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 60.844003793024605,
          "description": "min=53.5, mean=60.844, max=77.4, sum=425.908 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1020.035,
          "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 77.29,
          "description": "min=77.29, mean=77.29, max=77.29, sum=77.29 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 4.21795918367347,
          "description": "min=2.09, mean=4.218, max=5, sum=21.09 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 949.5172570702738,
          "description": "min=253.442, mean=949.517, max=3254.159, sum=4747.586 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1.3868394951957552,
          "description": "min=1, mean=1.387, max=2.032, sum=6.934 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-3.5-turbo-0613",
            "legalbench:subset=proa,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1020.4135188866799,
          "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 193.04258583116683,
          "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        },
        {
          "value": 25.038384118366725,
          "description": "min=21.983, mean=25.038, max=26.352, sum=125.192 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-3.5-turbo-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-3.5-turbo-0613"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 Turbo (1106 preview)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 3522.6704225352114,
          "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 9.88450704225352,
          "description": "min=9.885, mean=9.885, max=9.885, sum=9.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1762.593,
          "description": "min=1762.593, mean=1762.593, max=1762.593, sum=1762.593 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 8.753,
          "description": "min=8.753, mean=8.753, max=8.753, sum=8.753 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 173.127,
          "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 14.157,
          "description": "min=14.157, mean=14.157, max=14.157, sum=14.157 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 242.782,
          "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 460.71996491228066,
          "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1323.910874184069,
          "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 161.87607288445722,
          "description": "min=122.465, mean=161.876, max=186.673, sum=1133.133 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1020.035,
          "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 98.073,
          "description": "min=98.073, mean=98.073, max=98.073, sum=98.073 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1570.162971355988,
          "description": "min=253.442, mean=1570.163, max=6357.388, sum=7850.815 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1.458208948802524,
          "description": "min=1, mean=1.458, max=2.695, sum=7.291 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-1106-preview",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-1106-preview",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-1106-preview",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-1106-preview",
            "legalbench:subset=proa,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1020.4135188866799,
          "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 193.04258583116683,
          "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        },
        {
          "value": 26.995945480960394,
          "description": "min=26.229, mean=26.996, max=28.59, sum=134.98 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-1106-preview",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-1106-preview"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 (0613)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 3522.6704225352114,
          "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 8.51549295774648,
          "description": "min=8.515, mean=8.515, max=8.515, sum=8.515 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 4.964,
          "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1717.847,
          "description": "min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 8.055,
          "description": "min=8.055, mean=8.055, max=8.055, sum=8.055 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 173.127,
          "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 3.832,
          "description": "min=3.832, mean=3.832, max=3.832, sum=3.832 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 242.782,
          "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 460.71996491228066,
          "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-0613",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1323.910874184069,
          "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 73.25695858608955,
          "description": "min=59.674, mean=73.257, max=81.1, sum=512.799 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1020.035,
          "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 111.209,
          "description": "min=111.209, mean=111.209, max=111.209, sum=111.209 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1568.6870529886412,
          "description": "min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1.3396070557866055,
          "description": "min=1, mean=1.34, max=2.063, sum=6.698 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-0613",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-0613",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-0613",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-0613",
            "legalbench:subset=proa,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1020.4135188866799,
          "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 193.04258583116683,
          "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        },
        {
          "value": 25.424382072946933,
          "description": "min=23.767, mean=25.424, max=26.121, sum=127.122 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-0613",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-0613"
          ]
        }
      ],
      [
        {
          "value": "GPT-4 Turbo (2024-04-09)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 3495.6704225352114,
          "description": "min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 6.0366197183098596,
          "description": "min=6.037, mean=6.037, max=6.037, sum=6.037 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1728.593,
          "description": "min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.902,
          "description": "min=5.902, mean=5.902, max=5.902, sum=5.902 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 139.127,
          "description": "min=139.127, mean=139.127, max=139.127, sum=139.127 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.263,
          "description": "min=5.263, mean=5.263, max=5.263, sum=5.263 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 249.782,
          "description": "min=249.782, mean=249.782, max=249.782, sum=249.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 467.71996491228066,
          "description": "min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1262.9108741840687,
          "description": "min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 189.56082409362702,
          "description": "min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 959.035,
          "description": "min=959.035, mean=959.035, max=959.035, sum=959.035 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 141.712,
          "description": "min=141.712, mean=141.712, max=141.712, sum=141.712 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4-turbo-2024-04-09,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1524.162971355988,
          "description": "min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1.3251168793919403,
          "description": "min=1, mean=1.325, max=2.032, sum=6.626 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4-turbo-2024-04-09",
            "legalbench:subset=proa,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1027.4135188866799,
          "description": "min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 148.04258583116683,
          "description": "min=124.901, mean=148.043, max=168.185, sum=740.213 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        },
        {
          "value": 25.26444840571953,
          "description": "min=23.744, mean=25.264, max=25.938, sum=126.322 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=de-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4-turbo-2024-04-09",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4-turbo-2024-04-09"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o (2024-05-13)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 3461.667605633803,
          "description": "min=3461.668, mean=3461.668, max=3461.668, sum=3461.668 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.619718309859155,
          "description": "min=4.62, mean=4.62, max=4.62, sum=4.62 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1724.02,
          "description": "min=1724.02, mean=1724.02, max=1724.02, sum=1724.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.41,
          "description": "min=5.41, mean=5.41, max=5.41, sum=5.41 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 139.953,
          "description": "min=139.953, mean=139.953, max=139.953, sum=139.953 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.245,
          "description": "min=4.245, mean=4.245, max=4.245, sum=4.245 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 245.486,
          "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 466.9916140350877,
          "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1273.320452019534,
          "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 245.4823665454633,
          "description": "min=187.942, mean=245.482, max=284.788, sum=1718.377 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 952.617,
          "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 213.475,
          "description": "min=213.475, mean=213.475, max=213.475, sum=213.475 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-2024-05-13,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1512.7954037538377,
          "description": "min=208.179, mean=1512.795, max=6254.98, sum=7563.977 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.2488971748171518,
          "description": "min=1, mean=1.249, max=2.021, sum=6.244 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-2024-05-13",
            "legalbench:subset=proa,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1009.0497017892644,
          "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 115.00557042361216,
          "description": "min=79.529, mean=115.006, max=138.497, sum=575.028 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        },
        {
          "value": 25.286879683437835,
          "description": "min=23.62, mean=25.287, max=26.018, sum=126.434 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-2024-05-13",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-2024-05-13"
          ]
        }
      ],
      [
        {
          "value": "GPT-4o mini (2024-07-18)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 3451.667605633803,
          "description": "min=3451.668, mean=3451.668, max=3451.668, sum=3451.668 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.48169014084507,
          "description": "min=4.482, mean=4.482, max=4.482, sum=4.482 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1714.02,
          "description": "min=1714.02, mean=1714.02, max=1714.02, sum=1714.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.175,
          "description": "min=5.175, mean=5.175, max=5.175, sum=5.175 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 129.953,
          "description": "min=129.953, mean=129.953, max=129.953, sum=129.953 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.847,
          "description": "min=4.847, mean=4.847, max=4.847, sum=4.847 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 245.486,
          "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 466.9916140350877,
          "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1273.320452019534,
          "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 238.23525019565412,
          "description": "min=167.884, mean=238.235, max=276.058, sum=1667.647 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 952.617,
          "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 215.465,
          "description": "min=215.465, mean=215.465, max=215.465, sum=215.465 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=openai_gpt-4o-mini-2024-07-18,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 4.8,
          "description": "min=4, mean=4.8, max=5, sum=24 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1502.7954037538377,
          "description": "min=198.179, mean=1502.795, max=6244.98, sum=7513.977 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.2930331277785745,
          "description": "min=1, mean=1.293, max=2.253, sum=6.465 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=corporate_lobbying,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=function_of_decision_section,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=international_citizenship_questions,model=openai_gpt-4o-mini-2024-07-18",
            "legalbench:subset=proa,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1009.0497017892644,
          "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 105.00557042361216,
          "description": "min=69.529, mean=105.006, max=128.497, sum=525.028 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        },
        {
          "value": 25.504310196513227,
          "description": "min=23.748, mean=25.504, max=26.235, sum=127.522 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=de-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=fr-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=hi-en,model=openai_gpt-4o-mini-2024-07-18",
            "wmt_14:language_pair=ru-en,model=openai_gpt-4o-mini-2024-07-18"
          ]
        }
      ],
      [
        {
          "value": "Palmyra X V2 (33B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 3504.5774647887324,
          "description": "min=3504.577, mean=3504.577, max=3504.577, sum=3504.577 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 8.208450704225353,
          "description": "min=8.208, mean=8.208, max=8.208, sum=8.208 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 4.926,
          "description": "min=4.926, mean=4.926, max=4.926, sum=4.926 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.013,
          "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1662.782,
          "description": "min=1662.782, mean=1662.782, max=1662.782, sum=1662.782 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 7.809,
          "description": "min=7.809, mean=7.809, max=7.809, sum=7.809 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 116.254,
          "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 7.067,
          "description": "min=7.067, mean=7.067, max=7.067, sum=7.067 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 254.21,
          "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 472.2740350877192,
          "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v2",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1375.7353092779654,
          "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 87.03154467364993,
          "description": "min=64, mean=87.032, max=107.385, sum=609.221 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 938.869,
          "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 89.718,
          "description": "min=89.718, mean=89.718, max=89.718, sum=89.718 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 4.596734693877551,
          "description": "min=3.984, mean=4.597, max=5, sum=22.984 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1355.7586406214054,
          "description": "min=205.632, mean=1355.759, max=5467.178, sum=6778.793 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 2.0771673311343752,
          "description": "min=1, mean=2.077, max=5.406, sum=10.386 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v2",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v2",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v2",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v2",
            "legalbench:subset=proa,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1038.8608349900596,
          "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 181.69386660804403,
          "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        },
        {
          "value": 25.14180111637865,
          "description": "min=23.829, mean=25.142, max=25.958, sum=125.709 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v2",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v2"
          ]
        }
      ],
      [
        {
          "value": "Palmyra X V3 (72B)",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 3504.5774647887324,
          "description": "min=3504.577, mean=3504.577, max=3504.577, sum=3504.577 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 11.149295774647888,
          "description": "min=11.149, mean=11.149, max=11.149, sum=11.149 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 4.885,
          "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.02,
          "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1617.709,
          "description": "min=1617.709, mean=1617.709, max=1617.709, sum=1617.709 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 12.864,
          "description": "min=12.864, mean=12.864, max=12.864, sum=12.864 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 116.254,
          "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 19.113,
          "description": "min=19.113, mean=19.113, max=19.113, sum=19.113 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 254.21,
          "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 472.2740350877192,
          "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-v3",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1375.7353092779654,
          "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 83.13468064416656,
          "description": "min=60.012, mean=83.135, max=128.942, sum=581.943 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 938.869,
          "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 89.919,
          "description": "min=89.919, mean=89.919, max=89.919, sum=89.919 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 4.596734693877551,
          "description": "min=3.984, mean=4.597, max=5, sum=22.984 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1355.7586406214054,
          "description": "min=205.632, mean=1355.759, max=5467.178, sum=6778.793 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.0776021798365123,
          "description": "min=1, mean=1.078, max=1.2, sum=5.388 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-v3",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-v3",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-v3",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-v3",
            "legalbench:subset=proa,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1038.8608349900596,
          "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=1 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 181.69386660804403,
          "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        },
        {
          "value": 24.983090877810064,
          "description": "min=23.356, mean=24.983, max=25.829, sum=124.915 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-v3",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-v3"
          ]
        }
      ],
      [
        {
          "value": "Palmyra-X-004",
          "description": "",
          "markdown": false
        },
        {
          "markdown": false
        },
        {
          "value": 355.0,
          "description": "min=355, mean=355, max=355, sum=355 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 3484.2676056338028,
          "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 6.338028169014085,
          "description": "min=6.338, mean=6.338, max=6.338, sum=6.338 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "narrative_qa:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 4.965,
          "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.007,
          "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1675.231,
          "description": "min=1675.231, mean=1675.231, max=1675.231, sum=1675.231 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 10.295,
          "description": "min=10.295, mean=10.295, max=10.295, sum=10.295 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=openbook_longans,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 129.12,
          "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 12.549,
          "description": "min=12.549, mean=12.549, max=12.549, sum=12.549 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "natural_qa:mode=closedbook,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 500.0,
          "description": "min=500, mean=500, max=500, sum=500 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 249.776,
          "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.992,
          "description": "min=0.992, mean=0.992, max=0.992, sum=0.992 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "commonsense:dataset=openbookqa,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 102.8,
          "description": "min=100, mean=102.8, max=114, sum=514 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=25 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 467.6862105263158,
          "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.9902456140350877,
          "description": "min=0.97, mean=0.99, max=1, sum=4.951 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=computer_security,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=econometrics,method=multiple_choice_joint,model=writer_palmyra-x-004",
            "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 62.42857142857143,
          "description": "min=30, mean=62.429, max=135, sum=437 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 8.0,
          "description": "min=8, mean=8, max=8, sum=56 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1262.9092130545007,
          "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 209.3327932233685,
          "description": "min=174.547, mean=209.333, max=238.692, sum=1465.33 (7)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "math:subject=algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=counting_and_probability,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=geometry,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=intermediate_algebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=number_theory,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=prealgebra,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none",
            "math:subject=precalculus,level=1,use_official_examples=False,use_chain_of_thought=True,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1000.0,
          "description": "min=1000, mean=1000, max=1000, sum=1000 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 959.032,
          "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 174.327,
          "description": "min=174.327, mean=174.327, max=174.327, sum=174.327 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "gsm:model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 409.4,
          "description": "min=95, mean=409.4, max=1000, sum=2047 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 4.798367346938775,
          "description": "min=4, mean=4.798, max=5, sum=23.992 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1524.206501356544,
          "description": "min=216.442, mean=1524.207, max=6297.633, sum=7621.033 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1.4163162483866343,
          "description": "min=1, mean=1.416, max=2.021, sum=7.082 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "legalbench:subset=abercrombie,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=corporate_lobbying,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=function_of_decision_section,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=international_citizenship_questions,model=writer_palmyra-x-004,stop=none",
            "legalbench:subset=proa,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 503.0,
          "description": "min=503, mean=503, max=503, sum=503 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 5.0,
          "description": "min=5, mean=5, max=5, sum=5 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 1025.2743538767395,
          "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 0.9920477137176938,
          "description": "min=0.992, mean=0.992, max=0.992, sum=0.992 (1)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "med_qa:model=writer_palmyra-x-004"
          ]
        },
        {
          "value": 568.8,
          "description": "min=503, mean=568.8, max=832, sum=2844 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 1.0,
          "description": "min=1, mean=1, max=1, sum=5 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 0.0,
          "description": "min=0, mean=0, max=0, sum=0 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 115.71178123566294,
          "description": "min=96.139, mean=115.712, max=136.117, sum=578.559 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        },
        {
          "value": 29.36160106667686,
          "description": "min=26.191, mean=29.362, max=37.718, sum=146.808 (5)",
          "style": {},
          "markdown": false,
          "run_spec_names": [
            "wmt_14:language_pair=cs-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=de-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=fr-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=hi-en,model=writer_palmyra-x-004,stop=none",
            "wmt_14:language_pair=ru-en,model=writer_palmyra-x-004,stop=none"
          ]
        }
      ]
    ],
    "links": [
      {
        "text": "LaTeX",
        "href": "benchmark_output/releases/v1.9.0/groups/latex/core_scenarios_general_information.tex"
      },
      {
        "text": "JSON",
        "href": "benchmark_output/releases/v1.9.0/groups/json/core_scenarios_general_information.json"
      }
    ],
    "name": "general_information"
  }
]