markitdown/tests/test_files/test_blog.html
2024-11-13 13:00:01 -08:00

70 lines
25 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!doctype html>
<html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.1.1">
<title data-rh="true">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"><meta data-rh="true" name="description" content="level 2 algebra"><meta data-rh="true" property="og:description" content="level 2 algebra"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2023-04-21T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://www.linkedin.com/in/chi-wang-49b15b16/"><meta data-rh="true" property="article:tag" content="LLM,GPT,research"><link data-rh="true" rel="icon" href="/autogen/img/ag.ico"><link data-rh="true" rel="canonical" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="en"><link data-rh="true" rel="alternate" href="https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math" hreflang="x-default"><link rel="alternate" type="application/rss+xml" href="/autogen/blog/rss.xml" title="AutoGen RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/autogen/blog/atom.xml" title="AutoGen Atom Feed">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.13.11/dist/katex.min.css" integrity="sha384-Um5gpz1odJg5Z4HAmzPtgZKdTBHZdw8S29IecapCSB31ligYPhHQZMIlWLYQGVoc" crossorigin="anonymous">
<script src="/autogen/js/custom.js" async defer="defer"></script><link rel="stylesheet" href="/autogen/assets/css/styles.ca10f300.css">
<script src="/autogen/assets/js/runtime~main.83ab9fec.js" defer="defer"></script>
<script src="/autogen/assets/js/main.5d28c826.js" defer="defer"></script>
</head>
<body class="navigation-with-keyboard">
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"light")}(),function(){try{const a=new URLSearchParams(window.location.search).entries();for(var[t,e]of a)if(t.startsWith("docusaurus-data-")){var n=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(n,e)}}catch(t){}}(),document.documentElement.setAttribute("data-announcement-bar-initially-dismissed",function(){try{return"true"===localStorage.getItem("docusaurus.announcement.dismiss")}catch(t){}return!1}())</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><div class="announcementBar_mb4j" style="background-color:#fafbfc;color:#091E42" role="banner"><div class="announcementBarPlaceholder_vyr4"></div><div class="content_knG7 announcementBarContent_xLdY">What's new in AutoGen? Read <a href="/autogen/blog/2024/03/03/AutoGen-Update">this blog</a> for an overview of updates</div><button type="button" aria-label="Close" class="clean-btn close closeButton_CVFx announcementBarClose_gvF7"><svg viewBox="0 0 15 15" width="14" height="14"><g stroke="currentColor" stroke-width="3.1"><path d="M.75.75l13.5 13.5M14.25.75L.75 14.25"></path></g></svg></button></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/autogen/"><div class="navbar__logo"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--light_NVdE"><img src="/autogen/img/ag.svg" alt="AutoGen" class="themedComponent_mlkZ themedComponent--dark_xIcU"></div><b class="navbar__title text--truncate">AutoGen</b></a><a class="navbar__item navbar__link" href="/autogen/docs/Getting-Started">Docs</a><a class="navbar__item navbar__link" href="/autogen/docs/reference/agentchat/conversable_agent">API</a><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/autogen/blog">Blog</a><a class="navbar__item navbar__link" href="/autogen/docs/FAQ">FAQ</a><a class="navbar__item navbar__link" href="/autogen/docs/Examples">Examples</a><a class="navbar__item navbar__link" href="/autogen/docs/notebooks">Notebooks</a><a class="navbar__item navbar__link" href="/autogen/docs/Gallery">Gallery</a></div><div class="navbar__items navbar__items--right"><div class="navbar__item dropdown dropdown--hoverable dropdown--right"><a href="#" aria-haspopup="true" aria-expanded="false" role="button" class="navbar__link">Other Languages</a><ul class="dropdown__menu"><li><a href="https://microsoft.github.io/autogen-for-net/" target="_blank" rel="noopener noreferrer" class="dropdown__link">Dotnet<svg width="12" height="12" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div><a href="https://github.com/microsoft/autogen" target="_blank" rel="noopener noreferrer" class="navbar__item navbar__link">GitHub<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a><div class="toggle_vylO colorModeToggle_DEke"><button class="clean-btn toggleButton_gllP toggleButtonDisabled_aARS" type="button" disabled="" title="Switch between dark and light mode (currently light mode)" aria-label="Switch between dark and light mode (currently light mode)" aria-live="polite"><svg viewBox="0 0 24 24" width="24" height="24" class="lightToggleIcon_pyhR"><path fill="currentColor" d="M12,9c1.65,0,3,1.35,3,3s-1.35,3-3,3s-3-1.35-3-3S10.35,9,12,9 M12,7c-2.76,0-5,2.24-5,5s2.24,5,5,5s5-2.24,5-5 S14.76,7,12,7L12,7z M2,13l2,0c0.55,0,1-0.45,1-1s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S1.45,13,2,13z M20,13l2,0c0.55,0,1-0.45,1-1 s-0.45-1-1-1l-2,0c-0.55,0-1,0.45-1,1S19.45,13,20,13z M11,2v2c0,0.55,0.45,1,1,1s1-0.45,1-1V2c0-0.55-0.45-1-1-1S11,1.45,11,2z M11,20v2c0,0.55,0.45,1,1,1s1-0.45,1-1v-2c0-0.55-0.45-1-1-1C11.45,19,11,19.45,11,20z M5.99,4.58c-0.39-0.39-1.03-0.39-1.41,0 c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0s0.39-1.03,0-1.41L5.99,4.58z M18.36,16.95 c-0.39-0.39-1.03-0.39-1.41,0c-0.39,0.39-0.39,1.03,0,1.41l1.06,1.06c0.39,0.39,1.03,0.39,1.41,0c0.39-0.39,0.39-1.03,0-1.41 L18.36,16.95z M19.42,5.99c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06c-0.39,0.39-0.39,1.03,0,1.41 s1.03,0.39,1.41,0L19.42,5.99z M7.05,18.36c0.39-0.39,0.39-1.03,0-1.41c-0.39-0.39-1.03-0.39-1.41,0l-1.06,1.06 c-0.39,0.39-0.39,1.03,0,1.41s1.03,0.39,1.41,0L7.05,18.36z"></path></svg><svg viewBox="0 0 24 24" width="24" height="24" class="darkToggleIcon_wfgR"><path fill="currentColor" d="M9.37,5.51C9.19,6.15,9.1,6.82,9.1,7.5c0,4.08,3.32,7.4,7.4,7.4c0.68,0,1.35-0.09,1.99-0.27C17.45,17.19,14.93,19,12,19 c-3.86,0-7-3.14-7-7C5,9.07,6.81,6.55,9.37,5.51z M12,3c-4.97,0-9,4.03-9,9s4.03,9,9,9s9-4.03,9-9c0-0.46-0.04-0.92-0.1-1.36 c-0.98,1.37-2.58,2.26-4.4,2.26c-2.98,0-5.4-2.42-5.4-5.4c0-1.81,0.89-3.42,2.26-4.4C12.92,3.04,12.46,3,12,3L12,3z"></path></svg></button></div><div class="navbarSearchContainer_Bca1"><div class="navbar__search searchBarContainer_NW3z"><input placeholder="Search" aria-label="Search" class="navbar__search-input"><div class="loadingRing_RJI3 searchBarLoadingRing_YnHq"><div></div><div></div><div></div><div></div></div><div class="searchHintContainer_Pkmr"><kbd class="searchHint_iIMx">ctrl</kbd><kbd class="searchHint_iIMx">K</kbd></div></div></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/03/03/AutoGen-Update">What&#x27;s New in AutoGen?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/29/StateFlow">StateFlow - Build LLM Workflows with Customized State-Oriented Transition Function in GroupChat</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/11/FSM-GroupChat">FSM Group Chat -- User-specified agent transitions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/02/02/AutoAnny">Anny: Assisting AutoGen Devs Via AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/26/Custom-Models">AutoGen with Custom Models: Empowering Users to Use Their Own Inference Mechanism</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/25/AutoGenBench">AutoGenBench -- A Tool for Measuring and Evaluating AutoGen Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2024/01/23/Code-execution-in-docker">Code execution is now by default inside docker container</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/29/AgentDescriptions">All About Agent Descriptions</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/23/AgentOptimizer">AgentOptimizer - An Agentic Way to Train Your LLM Agent</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/12/01/AutoGenStudio">AutoGen Studio: Interactively Explore Multi-Agent Workflows</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/26/Agent-AutoBuild">Agent AutoBuild - Automatically Building Multi-agent Systems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/20/AgentEval">How to Assess Utility of LLM-powered Applications?</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/13/OAI-assistants">AutoGen Meets GPTs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/09/EcoAssistant">EcoAssistant - Using LLM Assistants More Accurately and Affordably</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/11/06/LMM-Agent">Multimodal with GPT-4V and LLaVA</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/26/TeachableAgent">AutoGen&#x27;s Teachable Agents</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/10/18/RetrieveChat">Retrieval-Augmented Generation (RAG) Applications with AutoGen</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/07/14/Local-LLMs">Use AutoGen for Local LLMs</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/06/28/MathChat">MathChat - An Conversational Framework to Solve Math Problems</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval">Achieve More, Pay Less - Use GPT-4 Smartly</a></li><li class="sidebarItem__DBe"><a aria-current="page" class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href="/autogen/blog/2023/04/21/LLM-tuning-math">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="level 2 algebra"><header><h1 class="title_f1Hy" itemprop="headline">Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH</h1><div class="container_mt6G margin-vert--md"><time datetime="2023-04-21T00:00:00.000Z" itemprop="datePublished">April 21, 2023</time> · <!-- -->6 min read</div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="https://github.com/sonichi.png" alt="Chi Wang" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://www.linkedin.com/in/chi-wang-49b15b16/" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Chi Wang</span></a></div><small class="avatar__subtitle" itemprop="description">Principal Researcher at Microsoft Research</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><p><img decoding="async" loading="lazy" alt="level 2 algebra" src="/autogen/assets/images/level2algebra-659ba95286432d9945fc89e84d606797.png" width="575" height="469" class="img_ev3q"></p>
<p><strong>TL;DR:</strong></p>
<ul>
<li><strong>Just by tuning the inference parameters like model, number of responses, temperature etc. without changing any model weights or prompt, the baseline accuracy of untuned gpt-4 can be improved by 20% in high school math competition problems.</strong></li>
<li><strong>For easy problems, the tuned gpt-3.5-turbo model vastly outperformed untuned gpt-4 in accuracy (e.g., 90% vs. 70%) and cost efficiency. For hard problems, the tuned gpt-4 is much more accurate (e.g., 35% vs. 20%) and less expensive than untuned gpt-4.</strong></li>
<li><strong>AutoGen can help with model selection, parameter tuning, and cost-saving in LLM applications.</strong></li>
</ul>
<p>Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?</p>
<p>In this blog post, we will explore how model and inference parameter matter in LLM applications, using a case study for <a href="https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html" target="_blank" rel="noopener noreferrer">MATH</a>, a benchmark for evaluating LLMs on advanced mathematical problem solving. MATH consists of 12K math competition problems from AMC-10, AMC-12 and AIME. Each problem is accompanied by a step-by-step solution.</p>
<p>We will use AutoGen to automatically find the best model and inference parameter for LLMs on a given task and dataset given an inference budget, using a novel low-cost search &amp; pruning strategy. AutoGen currently supports all the LLMs from OpenAI, such as GPT-3.5 and GPT-4.</p>
<p>We will use AutoGen to perform model selection and inference parameter tuning. Then we compare the performance and inference cost on solving algebra problems with the untuned gpt-4. We will also analyze how different difficulty levels affect the results.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-setup">Experiment Setup<a href="#experiment-setup" class="hash-link" aria-label="Direct link to Experiment Setup" title="Direct link to Experiment Setup"></a></h2>
<p>We use AutoGen to select between the following models with a target inference budget $0.02 per instance:</p>
<ul>
<li>gpt-3.5-turbo, a relatively cheap model that powers the popular ChatGPT app</li>
<li>gpt-4, the state of the art LLM that costs more than 10 times of gpt-3.5-turbo</li>
</ul>
<p>We adapt the models using 20 examples in the train set, using the problem statement as the input and generating the solution as the output. We use the following inference parameters:</p>
<ul>
<li>temperature: The parameter that controls the randomness of the output text. A higher temperature means more diversity but less coherence. We search for the optimal temperature in the range of [0, 1].</li>
<li>top_p: The parameter that controls the probability mass of the output tokens. Only tokens with a cumulative probability less than or equal to top-p are considered. A lower top-p means more diversity but less coherence. We search for the optimal top-p in the range of [0, 1].</li>
<li>max_tokens: The maximum number of tokens that can be generated for each output. We search for the optimal max length in the range of [50, 1000].</li>
<li>n: The number of responses to generate. We search for the optimal n in the range of [1, 100].</li>
<li>prompt: We use the template: &quot;{problem} Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \boxed{{}}.&quot; where {problem} will be replaced by the math problem instance.</li>
</ul>
<p>In this experiment, when n &gt; 1, we find the answer with highest votes among all the responses and then select it as the final answer to compare with the ground truth. For example, if n = 5 and 3 of the responses contain a final answer 301 while 2 of the responses contain a final answer 159, we choose 301 as the final answer. This can help with resolving potential errors due to randomness. We use the average accuracy and average inference cost as the metric to evaluate the performance over a dataset. The inference cost of a particular instance is measured by the price per 1K tokens and the number of tokens consumed.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="experiment-results">Experiment Results<a href="#experiment-results" class="hash-link" aria-label="Direct link to Experiment Results" title="Direct link to Experiment Results"></a></h2>
<p>The first figure in this blog post shows the average accuracy and average inference cost of each configuration on the level 2 Algebra test set.</p>
<p>Surprisingly, the tuned gpt-3.5-turbo model is selected as a better model and it vastly outperforms untuned gpt-4 in accuracy (92% vs. 70%) with equal or 2.5 times higher inference budget.
The same observation can be obtained on the level 3 Algebra test set.</p>
<p><img decoding="async" loading="lazy" alt="level 3 algebra" src="/autogen/assets/images/level3algebra-94e87a683ac8832ac7ae6f41f30131a4.png" width="575" height="469" class="img_ev3q"></p>
<p>However, the selected model changes on level 4 Algebra.</p>
<p><img decoding="async" loading="lazy" alt="level 4 algebra" src="/autogen/assets/images/level4algebra-492beb22490df30d6cc258f061912dcd.png" width="580" height="469" class="img_ev3q"></p>
<p>This time gpt-4 is selected as the best model. The tuned gpt-4 achieves much higher accuracy (56% vs. 44%) and lower cost than the untuned gpt-4.
On level 5 the result is similar.</p>
<p><img decoding="async" loading="lazy" alt="level 5 algebra" src="/autogen/assets/images/level5algebra-8fba701551334296d08580b4b489fe56.png" width="575" height="469" class="img_ev3q"></p>
<p>We can see that AutoGen has found different optimal model and inference parameters for each subset of a particular level, which shows that these parameters matter in cost-sensitive LLM applications and need to be carefully tuned or adapted.</p>
<p>An example notebook to run these experiments can be found at: <a href="https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb" target="_blank" rel="noopener noreferrer">https://github.com/microsoft/FLAML/blob/v1.2.1/notebook/autogen_chatgpt.ipynb</a>. The experiments were run when AutoGen was a subpackage in FLAML.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="analysis-and-discussion">Analysis and Discussion<a href="#analysis-and-discussion" class="hash-link" aria-label="Direct link to Analysis and Discussion" title="Direct link to Analysis and Discussion"></a></h2>
<p>While gpt-3.5-turbo demonstrates competitive accuracy with voted answers in relatively easy algebra problems under the same inference budget, gpt-4 is a better choice for the most difficult problems. In general, through parameter tuning and model selection, we can identify the opportunity to save the expensive model for more challenging tasks, and improve the overall effectiveness of a budget-constrained system.</p>
<p>There are many other alternative ways of solving math problems, which we have not covered in this blog post. When there are choices beyond the inference parameters, they can be generally tuned via <a href="https://microsoft.github.io/FLAML/docs/Use-Cases/Tune-User-Defined-Function" target="_blank" rel="noopener noreferrer"><code>flaml.tune</code></a>.</p>
<p>The need for model selection, parameter tuning and cost saving is not specific to the math problems. The <a href="https://github.com/Significant-Gravitas/Auto-GPT" target="_blank" rel="noopener noreferrer">Auto-GPT</a> project is an example where high cost can easily prevent a generic complex task to be accomplished as it needs many LLM inference calls.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="for-further-reading">For Further Reading<a href="#for-further-reading" class="hash-link" aria-label="Direct link to For Further Reading" title="Direct link to For Further Reading"></a></h2>
<ul>
<li><a href="https://arxiv.org/abs/2303.04673" target="_blank" rel="noopener noreferrer">Research paper about the tuning technique</a></li>
<li><a href="/autogen/docs/Use-Cases/enhanced_inference">Documentation about inference tuning</a></li>
</ul>
<p><em>Do you have any experience to share about LLM applications? Do you like to see more support or research of LLM optimization or automation? Please join our <a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer">Discord</a> server for discussion.</em></p></div><footer class="row docusaurus-mt-lg blogPostFooterDetailsFull_mRVl"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/llm">LLM</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/gpt">GPT</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/autogen/blog/tags/research">research</a></li></ul></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/autogen/blog/2023/05/18/GPT-adaptive-humaneval"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">Achieve More, Pay Less - Use GPT-4 Smartly</div></a></nav></main><div class="col col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#experiment-setup" class="table-of-contents__link toc-highlight">Experiment Setup</a></li><li><a href="#experiment-results" class="table-of-contents__link toc-highlight">Experiment Results</a></li><li><a href="#analysis-and-discussion" class="table-of-contents__link toc-highlight">Analysis and Discussion</a></li><li><a href="#for-further-reading" class="table-of-contents__link toc-highlight">For Further Reading</a></li></ul></div></div></div></div></div><footer class="footer footer--dark"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">Community</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://discord.gg/pAbnFJrkgZ" target="_blank" rel="noopener noreferrer" class="footer__link-item">Discord<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li><li class="footer__item"><a href="https://twitter.com/pyautogen" target="_blank" rel="noopener noreferrer" class="footer__link-item">Twitter<svg width="13.5" height="13.5" aria-hidden="true" viewBox="0 0 24 24" class="iconExternalLink_nPIU"><path fill="currentColor" d="M21 13v10h-21v-19h12v2h-10v15h17v-8h2zm3-12h-10.988l4.035 4-6.977 7.07 2.828 2.828 6.977-7.07 4.125 4.172v-11z"></path></svg></a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">Copyright © 2024 AutoGen Authors | <a target="_blank" style="color:#10adff" href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy and Cookies</a></div></div></div></footer></div>
</body>
</html>